From 76838a20b7bd936472d3431bbc7534afac883dad Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Fri, 30 Oct 2020 09:11:08 -0700 Subject: [PATCH 001/461] A model used to quickly simulate various GRV scenarios and algorithms --- contrib/grv_proxy_model/grv_test.py | 134 ++++++++ contrib/grv_proxy_model/plot.py | 107 +++++++ contrib/grv_proxy_model/priority.py | 40 +++ contrib/grv_proxy_model/proxy_model.py | 338 ++++++++++++++++++++ contrib/grv_proxy_model/rate_model.py | 83 +++++ contrib/grv_proxy_model/ratekeeper_model.py | 67 ++++ contrib/grv_proxy_model/smoother.py | 53 +++ contrib/grv_proxy_model/workload_model.py | 201 ++++++++++++ 8 files changed, 1023 insertions(+) create mode 100755 contrib/grv_proxy_model/grv_test.py create mode 100755 contrib/grv_proxy_model/plot.py create mode 100755 contrib/grv_proxy_model/priority.py create mode 100755 contrib/grv_proxy_model/proxy_model.py create mode 100755 contrib/grv_proxy_model/rate_model.py create mode 100755 contrib/grv_proxy_model/ratekeeper_model.py create mode 100644 contrib/grv_proxy_model/smoother.py create mode 100755 contrib/grv_proxy_model/workload_model.py diff --git a/contrib/grv_proxy_model/grv_test.py b/contrib/grv_proxy_model/grv_test.py new file mode 100755 index 0000000000..1cd0224538 --- /dev/null +++ b/contrib/grv_proxy_model/grv_test.py @@ -0,0 +1,134 @@ +#!/usr/bin/env python3 + +# +# grv_test.py +# +# This source file is part of the FoundationDB open source project +# +# Copyright 2013-2020 Apple Inc. and the FoundationDB project authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +import argparse +import inspect +import sys + +import rate_model +import workload_model +import proxy_model +import ratekeeper_model +from priority import Priority +from plot import Plotter + +parser = argparse.ArgumentParser() +parser.add_argument('-w', '--workload', type=str, help='Name of workload to run') +parser.add_argument('-r', '--ratekeeper', type=str, help='Name of ratekeeper model') +parser.add_argument('-d', '--duration', type=int, default=240, help='Duration of simulated test, in seconds. Defaults to 240.') +parser.add_argument('-L', '--limiter', type=str, default='Original', help='Name of limiter implementation. Defaults to \'Original\'.') +parser.add_argument('-p', '--proxy', type=str, default='ProxyModel', help='Name of proxy implementation. Defaults to \'ProxyModel\'.') +parser.add_argument('--list', action='store_true', default=False, help='List options for all models.') +parser.add_argument('--no-graph', action='store_true', default=False, help='Disable graphical output.') + +args = parser.parse_args() + +def print_choices_list(context=None): + if context == 'workload' or context is None: + print('Workloads:') + for w in workload_model.predefined_workloads.keys(): + print(' %s' % w) + + if context == 'ratekeeper' or context is None: + print('\nRatekeeper models:') + for r in ratekeeper_model.predefined_ratekeeper.keys(): + print(' %s' % r) + + proxy_model_classes = [c for c in [getattr(proxy_model, a) for a in dir(proxy_model)] if inspect.isclass(c)] + + if context == 'proxy' or context is None: + print('\nProxy models:') + for p in proxy_model_classes: + if issubclass(p, proxy_model.ProxyModel): + print(' %s' % p.__name__) + + if context == 'limiter' or context is None: + print('\nProxy limiters:') + for p in proxy_model_classes: + if issubclass(p, proxy_model.Limiter) and p != proxy_model.Limiter: + name = p.__name__ + if 
name.endswith('Limiter'): + name = name[0:-len('Limiter')] + print(' %s' % name) + +if args.workload is None or args.ratekeeper is None: + print('ERROR: A workload (-w/--workload) and ratekeeper model (-r/--ratekeeper) must be specified.\n') + print_choices_list() + sys.exit(1) + +if args.list: + print_choices_list() + sys.exit(0) + +def validate_class_type(var, name, superclass): + cls = getattr(var, name, None) + return cls is not None and inspect.isclass(cls) and issubclass(cls, superclass) + +if not args.ratekeeper in ratekeeper_model.predefined_ratekeeper: + print('Invalid ratekeeper model `%s\'' % args.ratekeeper) + print_choices_list('ratekeeper') + sys.exit(1) + +if not args.workload in workload_model.predefined_workloads: + print('Invalid workload model `%s\'' % args.workload) + print_choices_list('workload') + sys.exit(1) + +if not validate_class_type(proxy_model, args.proxy, proxy_model.ProxyModel): + print('Invalid proxy model `%s\'' % args.proxy) + print_choices_list('proxy') + sys.exit(1) + +limiter_name = args.limiter +if not validate_class_type(proxy_model, limiter_name, proxy_model.Limiter): + limiter_name += 'Limiter' + if not validate_class_type(proxy_model, limiter_name, proxy_model.Limiter): + print('Invalid proxy limiter `%s\'' % args.limiter) + print_choices_list('limiter') + sys.exit(1) + +ratekeeper = ratekeeper_model.predefined_ratekeeper[args.ratekeeper] +workload = workload_model.predefined_workloads[args.workload] + +limiter = getattr(proxy_model, limiter_name) +proxy = getattr(proxy_model, args.proxy)(args.duration, ratekeeper, workload, limiter) + +proxy.run() + +for priority in workload.priorities(): + latencies = sorted([p for t in proxy.results.latencies[priority].values() for p in t]) + total_started = sum(proxy.results.started[priority].values()) + still_queued = sum([r.count for r in proxy.request_queue if r.priority == priority]) + + if len(latencies) > 0: + print('\n%s: %d requests in %d seconds (rate=%f). %d still queued.' 
% (priority, total_started, proxy.time, float(total_started)/proxy.time, still_queued)) + print(' Median latency: %f' % latencies[len(latencies)//2]) + print(' 90%% latency: %f' % latencies[int(0.9*len(latencies))]) + print(' 99%% latency: %f' % latencies[int(0.99*len(latencies))]) + print(' 99.9%% latency: %f' % latencies[int(0.999*len(latencies))]) + print(' Max latency: %f' % latencies[-1]) + +print('') + +if not args.no_graph: + plotter = Plotter(proxy.results) + plotter.display() diff --git a/contrib/grv_proxy_model/plot.py b/contrib/grv_proxy_model/plot.py new file mode 100755 index 0000000000..9334e2c844 --- /dev/null +++ b/contrib/grv_proxy_model/plot.py @@ -0,0 +1,107 @@ +# +# plot.py +# +# This source file is part of the FoundationDB open source project +# +# Copyright 2013-2020 Apple Inc. and the FoundationDB project authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
#

import matplotlib.pyplot as plt

class Plotter:
    """Renders simulation results (a ProxyModel.Results) as a 3x3 grid of plots."""

    def __init__(self, results):
        self.results = results

    # FIX: these helpers use no instance state and are invoked as
    # Plotter.add_plot(...); mark them @staticmethod so the definitions match
    # how they are called (previously they were missing `self` entirely).
    @staticmethod
    def add_plot(data, time_resolution, label, use_avg=False):
        """Plot `data` ({time: value}) bucketed to `time_resolution`, summing
        values per bucket, or averaging them when use_avg is True."""
        out_data = {}
        counts = {}
        for t in data.keys():
            bucket = t // time_resolution * time_resolution
            out_data.setdefault(bucket, 0)
            counts.setdefault(bucket, 0)
            out_data[bucket] += data[t]
            counts[bucket] += 1

        if use_avg:
            out_data = {t: v / counts[t] for t, v in out_data.items()}

        plt.plot(list(out_data.keys()), list(out_data.values()), label=label)

    @staticmethod
    def add_plot_with_times(data, label):
        """Plot `data` ({time: value}) without any time bucketing."""
        plt.plot(list(data.keys()), list(data.values()), label=label)

    def display(self, time_resolution=0.1):
        """Draw all panels: start/queue rates, max queue sizes, per-priority
        latencies, and per-priority limiter internals."""
        plt.figure(figsize=(40, 9))
        plt.subplot(3, 3, 1)
        for priority in self.results.started.keys():
            Plotter.add_plot(self.results.started[priority], time_resolution, priority)

        plt.xlabel('Time (s)')
        plt.ylabel('Released/s')
        plt.legend()

        plt.subplot(3, 3, 2)
        for priority in self.results.queued.keys():
            Plotter.add_plot(self.results.queued[priority], time_resolution, priority)

        plt.xlabel('Time (s)')
        plt.ylabel('Requests/s')
        plt.legend()

        plt.subplot(3, 3, 3)
        for priority in self.results.unprocessed_queue_sizes.keys():
            # Each value is a list of queue-size samples per second; plot the max.
            data = {k: max(v) for (k, v) in self.results.unprocessed_queue_sizes[priority].items()}
            Plotter.add_plot(data, time_resolution, priority)

        plt.xlabel('Time (s)')
        plt.ylabel('Max queue size')
        plt.legend()

        num = 4
        for priority in self.results.latencies.keys():
            plt.subplot(3, 3, num)
            median_latencies = {k: v[int(0.5 * len(v))] if len(v) > 0 else 0 for (k, v) in self.results.latencies[priority].items()}
            percentile90_latencies = {k: v[int(0.9 * len(v))] if len(v) > 0 else 0 for (k, v) in self.results.latencies[priority].items()}
            max_latencies = {k: max(v) if len(v) > 0 else 0 for (k, v) in self.results.latencies[priority].items()}

            Plotter.add_plot(median_latencies, time_resolution, 'median')
            Plotter.add_plot(percentile90_latencies, time_resolution, '90th percentile')
            Plotter.add_plot(max_latencies, time_resolution, 'max')

            plt.xlabel('Time (s)')
            plt.ylabel(str(priority) + ' Latency (s)')
            plt.yscale('log')
            plt.legend()
            num += 1

        for priority in self.results.rate.keys():
            plt.subplot(3, 3, num)
            if len(self.results.rate[priority]) > 0:
                Plotter.add_plot(self.results.rate[priority], time_resolution, 'Rate', use_avg=True)
            if len(self.results.released[priority]) > 0:
                Plotter.add_plot(self.results.released[priority], time_resolution, 'Released', use_avg=True)
            if len(self.results.limit[priority]) > 0:
                Plotter.add_plot(self.results.limit[priority], time_resolution, 'Limit', use_avg=True)
            if len(self.results.limit_and_budget[priority]) > 0:
                Plotter.add_plot(self.results.limit_and_budget[priority], time_resolution, 'Limit and budget', use_avg=True)
            if len(self.results.budget[priority]) > 0:
                Plotter.add_plot(self.results.budget[priority], time_resolution, 'Budget', use_avg=True)

            plt.xlabel('Time (s)')
            plt.ylabel('Value (' + str(priority) + ')')
            plt.legend()
            num += 1

        plt.show()

# diff --git a/contrib/grv_proxy_model/priority.py b/contrib/grv_proxy_model/priority.py
# new file mode 100755
# index 0000000000..3ba5c05f2e
# --- /dev/null
# +++ b/contrib/grv_proxy_model/priority.py
# @@ -0,0 +1,40 @@
#
# priority.py
#
# This source file is part of the FoundationDB open source project
#
# Copyright 2013-2020 Apple Inc. and the FoundationDB project authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import functools

@functools.total_ordering
class Priority:
    """Transaction priority level; a lower priority_value is more important.

    functools.total_ordering derives <=, >, >= from __lt__ and __eq__.
    """

    def __init__(self, priority_value, label):
        self.priority_value = priority_value  # numeric rank (0 is highest priority)
        self.label = label                    # human-readable name

    def __lt__(self, other):
        return self.priority_value < other.priority_value

    # BUG FIX: total_ordering needs __eq__ to derive the remaining comparisons
    # correctly; the original relied on default identity equality, so two
    # Priority objects with the same priority_value compared unequal.
    def __eq__(self, other):
        return isinstance(other, Priority) and self.priority_value == other.priority_value

    # Defining __eq__ suppresses the inherited __hash__; restore hashing so
    # Priority instances keep working as dictionary keys.
    def __hash__(self):
        return hash(self.priority_value)

    def __str__(self):
        return self.label

    def __repr__(self):
        return repr(self.label)

# The three priority levels used throughout the model, most important first.
Priority.SYSTEM = Priority(0, "System")
Priority.DEFAULT = Priority(1, "Default")
Priority.BATCH = Priority(2, "Batch")

# diff --git a/contrib/grv_proxy_model/proxy_model.py b/contrib/grv_proxy_model/proxy_model.py
# new file mode 100755
# index 0000000000..9ca2a39bfe
# --- /dev/null
# +++ b/contrib/grv_proxy_model/proxy_model.py
# @@ -0,0 +1,338 @@
#
# proxy_model.py
#
# This source file is part of the FoundationDB open source project
#
# Copyright 2013-2020 Apple Inc. and the FoundationDB project authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+# + +import copy +import functools +import heapq + +from priority import Priority +from smoother import Smoother + +@functools.total_ordering +class Task: + def __init__(self, time, fxn): + self.time = time + self.fxn = fxn + + def __lt__(self, other): + return self.time < other.time + +class Limiter: + class UpdateRateParams: + def __init__(self, time): + self.time = time + + class UpdateLimitParams: + def __init__(self, time, elapsed): + self.time = time + self.elapsed = elapsed + + class CanStartParams: + def __init__(self, time, num_started, count): + self.time = time + self.num_started = num_started + self.count = count + + class UpdateBudgetParams: + def __init__(self, time, num_started, num_started_at_priority, min_priority, last_batch, queue_empty, elapsed): + self.time = time + self.num_started = num_started + self.num_started_at_priority = num_started_at_priority + self.min_priority = min_priority + self.last_batch = last_batch + self.queue_empty = queue_empty + self.elapsed = elapsed + + def __init__(self, priority, ratekeeper_model, proxy_model): + self.priority = priority + self.ratekeeper_model = ratekeeper_model + self.proxy_model = proxy_model + self.limit = 0 + self.rate = self.ratekeeper_model.get_limit(0, self.priority) + + def update_rate(self, params): + pass + + def update_limit(self, params): + pass + + def can_start(self, params): + pass + + def update_budget(self, params): + pass + +class OriginalLimiter(Limiter): + def __init__(self, priority, limit_rate_model, proxy_model): + Limiter.__init__(self, priority, limit_rate_model, proxy_model) + + def update_rate(self, params): + self.rate = self.ratekeeper_model.get_limit(params.time, self.priority) + + def update_limit(self, params): + self.limit = min(0, self.limit) + params.elapsed * self.rate + self.limit = min(self.limit, self.rate * 0.01) + self.limit = min(self.limit, 100000) + + self.proxy_model.results.rate[self.priority][params.time] = self.rate + 
self.proxy_model.results.limit[self.priority][params.time] = self.limit + + def can_start(self, params): + return params.num_started < self.limit + + def update_budget(self, params): + self.limit -= params.num_started + +class PositiveBudgetLimiter(OriginalLimiter): + def __init__(self, priority, limit_rate_model, proxy_model): + OriginalLimiter.__init__(self, priority, limit_rate_model, proxy_model) + + def update_limit(self, params): + self.limit += params.elapsed * self.rate + self.limit = min(self.limit, 2.0 * self.rate) + +class ClampedBudgetLimiter(PositiveBudgetLimiter): + def __init__(self, priority, limit_rate_model, proxy_model): + PositiveBudgetLimiter.__init__(self, priority, limit_rate_model, proxy_model) + + def update_budget(self, params): + min_budget = -self.rate * 5.0 + if self.limit > min_budget: + self.limit = max(self.limit - params.num_started, min_budget) + +class TimeLimiter(PositiveBudgetLimiter): + def __init__(self, priority, limit_rate_model, proxy_model): + PositiveBudgetLimiter.__init__(self, priority, limit_rate_model, proxy_model) + self.locked_until = 0 + + def can_start(self, params): + return params.time >= self.locked_until and PositiveBudgetLimiter.can_start(self, params) + + def update_budget(self, params): + #print('Start update budget: time=%f, limit=%f, locked_until=%f, num_started=%d, priority=%s, min_priority=%s, last_batch=%d' % (params.time, self.limit, self.locked_until, params.num_started, self.priority, params.min_priority, params.last_batch)) + + if params.min_priority >= self.priority or params.num_started < self.limit: + self.limit -= params.num_started + else: + self.limit = min(self.limit, max(self.limit - params.num_started, -params.last_batch)) + self.locked_until = min(params.time + 2.0, max(params.time, self.locked_until) + (params.num_started - self.limit)/self.rate) + + #print('End update budget: time=%f, limit=%f, locked_until=%f, num_started=%d, priority=%s, min_priority=%s' % (params.time, self.limit, 
#          self.locked_until, params.num_started, self.priority, params.min_priority))

class TimePositiveBudgetLimiter(PositiveBudgetLimiter):
    """PositiveBudgetLimiter variant that, instead of letting the budget go
    negative, locks out new releases for a period proportional to how far a
    batch overshot the limit."""
    def __init__(self, priority, limit_rate_model, proxy_model):
        PositiveBudgetLimiter.__init__(self, priority, limit_rate_model, proxy_model)
        self.locked_until = 0  # no budget accumulates before this time

    def update_limit(self, params):
        # Budget accumulates only while not locked out.
        if params.time >= self.locked_until:
            PositiveBudgetLimiter.update_limit(self, params)

    def can_start(self, params):
        return params.num_started + params.count <= self.limit

    def update_budget(self, params):
        if params.num_started > self.limit:
            # BUG FIX: the original referenced an undefined name `penalty`,
            # which raised NameError whenever a batch overshot the limit.
            # The penalty is the overshoot amount, converted to seconds of
            # lockout by dividing by the release rate (extending the lockout
            # at most 2 seconds past the current time), mirroring
            # TimeLimiter.update_budget.
            penalty = params.num_started - self.limit
            self.locked_until = min(params.time + 2.0, max(params.time, self.locked_until) + penalty / self.rate)
            self.limit = 0
        else:
            self.limit -= params.num_started

class SmoothingLimiter(OriginalLimiter):
    """Limiter that exponentially smooths both the ratekeeper rate and the
    released-transaction rate, releasing up to twice their difference."""
    def __init__(self, priority, limit_rate_model, proxy_model):
        OriginalLimiter.__init__(self, priority, limit_rate_model, proxy_model)
        self.smooth_released = Smoother(2)
        self.smooth_rate_limit = Smoother(2)
        self.rate_set = False  # True once the first rate sample has seeded the smoother

    def update_rate(self, params):
        OriginalLimiter.update_rate(self, params)
        if not self.rate_set:
            # Seed the smoother with the first observed rate rather than
            # smoothing up from zero.
            self.rate_set = True
            self.smooth_rate_limit.reset(self.rate)
        else:
            self.smooth_rate_limit.set_total(params.time, self.rate)

    def update_limit(self, params):
        self.limit = 2.0 * (self.smooth_rate_limit.smooth_total(params.time) - self.smooth_released.smooth_rate(params.time))

    def can_start(self, params):
        # (the comparison expression continues in the next mangled chunk)
        return
params.num_started + params.count <= self.limit + + def update_budget(self, params): + self.smooth_released.add_delta(params.time, params.num_started) + +class SmoothingBudgetLimiter(SmoothingLimiter): + def __init__(self, priority, limit_rate_model, proxy_model): + SmoothingLimiter.__init__(self, priority, limit_rate_model, proxy_model) + #self.smooth_filled = Smoother(2) + self.budget = 0 + + def update_limit(self, params): + release_rate = (self.smooth_rate_limit.smooth_total(params.time) - self.smooth_released.smooth_rate(params.time)) + #self.smooth_filled.set_total(params.time, 1 if release_rate > 0 else 0) + self.limit = 2.0 * release_rate + + self.proxy_model.results.rate[self.priority][params.time] = self.smooth_rate_limit.smooth_total(params.time) + self.proxy_model.results.released[self.priority][params.time] = self.smooth_released.smooth_rate(params.time) + self.proxy_model.results.limit[self.priority][params.time] = self.limit + self.proxy_model.results.limit_and_budget[self.priority][params.time] = self.limit + self.budget + self.proxy_model.results.budget[self.priority][params.time] = self.budget + + #self.budget = max(0, self.budget + params.elapsed * self.smooth_rate_limit.smooth_total(params.time)) + + #if self.smooth_filled.smooth_total(params.time) >= 0.1: + #self.budget += params.elapsed * self.smooth_rate_limit.smooth_total(params.time) + + #print('Update limit: time=%f, priority=%s, limit=%f, rate=%f, released=%f, budget=%f' % (params.time, self.priority, self.limit, self.smooth_rate_limit.smooth_total(params.time), self.smooth_released.smooth_rate(params.time), self.budget)) + + def can_start(self, params): + return params.num_started + params.count <= self.limit + self.budget #or params.num_started + params.count <= self.budget + + def update_budget(self, params): + self.budget = max(0, self.budget + (self.limit - params.num_started_at_priority) / 2 * params.elapsed) + + if params.queue_empty: + self.budget = min(10, self.budget) + + 
self.smooth_released.add_delta(params.time, params.num_started_at_priority) + +class ProxyModel: + class Results: + def __init__(self, priorities, duration): + self.started = self.init_result(priorities, 0, duration) + self.queued = self.init_result(priorities, 0, duration) + self.latencies = self.init_result(priorities, [], duration) + self.unprocessed_queue_sizes = self.init_result(priorities, [], duration) + + self.rate = {p:{} for p in priorities} + self.released = {p:{} for p in priorities} + self.limit = {p:{} for p in priorities} + self.limit_and_budget = {p:{} for p in priorities} + self.budget = {p:{} for p in priorities} + + def init_result(self, priorities, starting_value, duration): + return {p: {s: copy.copy(starting_value) for s in range(0, duration)} for p in priorities} + + def __init__(self, duration, ratekeeper_model, workload_model, Limiter): + self.time = 0 + self.log_time = 0 + self.duration = duration + self.priority_limiters = { priority: Limiter(priority, ratekeeper_model, self) for priority in workload_model.priorities() } + self.workload_model = workload_model + self.request_scheduled = { p: False for p in self.workload_model.priorities()} + + self.tasks = [] + self.request_queue = [] + self.results = ProxyModel.Results(self.workload_model.priorities(), duration) + + def run(self): + self.update_rate() + self.process_requests(self.time) + + for priority in self.workload_model.priorities(): + next_request = self.workload_model.next_request(self.time, priority) + assert next_request is not None + heapq.heappush(self.tasks, Task(next_request.time, lambda next_request=next_request: self.receive_request(next_request))) + self.request_scheduled[priority] = True + + while True:# or len(self.request_queue) > 0: + if int(self.time) > self.log_time: + self.log_time = int(self.time) + #print(self.log_time) + + task = heapq.heappop(self.tasks) + self.time = task.time + if self.time >= self.duration: + break + + task.fxn() + + def update_rate(self): + 
for limiter in self.priority_limiters.values(): + limiter.update_rate(Limiter.UpdateRateParams(self.time)) + + heapq.heappush(self.tasks, Task(self.time + 0.01, lambda: self.update_rate())) + + def receive_request(self, request): + heapq.heappush(self.request_queue, request) + + self.results.queued[request.priority][int(self.time)] += request.count + + next_request = self.workload_model.next_request(self.time, request.priority) + if next_request is not None and next_request.time < self.duration: + heapq.heappush(self.tasks, Task(next_request.time, lambda: self.receive_request(next_request))) + else: + self.request_scheduled[request.priority] = False + + def process_requests(self, last_time): + elapsed = self.time - last_time + for limiter in self.priority_limiters.values(): + limiter.update_limit(Limiter.UpdateLimitParams(self.time, elapsed)) + + current_started = 0 + started = {p:0 for p in self.workload_model.priorities()} + + min_priority = Priority.SYSTEM + last_batch = 0 + while len(self.request_queue) > 0: + request = self.request_queue[0] + + if not self.priority_limiters[request.priority].can_start(Limiter.CanStartParams(self.time, current_started, request.count)): + break + + min_priority = request.priority + last_batch = request.count + + if self.workload_model.request_completed(request) and not self.request_scheduled[request.priority]: + next_request = self.workload_model.next_request(self.time, request.priority) + assert next_request is not None + heapq.heappush(self.tasks, Task(next_request.time, lambda next_request=next_request: self.receive_request(next_request))) + self.request_scheduled[request.priority] = True + + current_started += request.count + started[request.priority] += request.count + + heapq.heappop(self.request_queue) + self.results.started[request.priority][int(self.time)] += request.count + self.results.latencies[request.priority][int(self.time)].append(self.time-request.time) + + if len(self.request_queue) == 0: + min_priority = 
Priority.BATCH + + for priority, limiter in self.priority_limiters.items(): + started_at_priority = sum([v for p,v in started.items() if p <= priority]) + limiter.update_budget(Limiter.UpdateBudgetParams(self.time, current_started, started_at_priority, min_priority, last_batch, len(self.request_queue) == 0 or self.request_queue[0].priority > priority, elapsed)) + + for priority in self.workload_model.priorities(): + self.results.unprocessed_queue_sizes[priority][int(self.time)].append(self.workload_model.workload_models[priority].outstanding) + + current_time = self.time + + delay = 0.001 + heapq.heappush(self.tasks, Task(self.time + delay, lambda: self.process_requests(current_time))) + + diff --git a/contrib/grv_proxy_model/rate_model.py b/contrib/grv_proxy_model/rate_model.py new file mode 100755 index 0000000000..1fabce2c7e --- /dev/null +++ b/contrib/grv_proxy_model/rate_model.py @@ -0,0 +1,83 @@ +# +# rate_model.py +# +# This source file is part of the FoundationDB open source project +# +# Copyright 2013-2020 Apple Inc. and the FoundationDB project authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
#

import numpy

class RateModel:
    """Abstract model producing a transaction rate as a function of time."""
    def __init__(self):
        pass

    def get_rate(self, time):
        """Return the rate at `time`. Subclasses must override."""
        pass

class FixedRateModel(RateModel):
    """Always returns the same rate."""
    def __init__(self, rate):
        RateModel.__init__(self)
        self.rate = rate

    def get_rate(self, time):
        return self.rate

class UnlimitedRateModel(FixedRateModel):
    """Effectively unbounded rate (1e9 per second)."""
    def __init__(self):
        # FIX: delegate to the parent constructor instead of assigning
        # self.rate directly, keeping FixedRateModel initialization in one place.
        FixedRateModel.__init__(self, 1e9)

class IntervalRateModel(RateModel):
    """Piecewise-constant rate given as [(start_time, rate), ...].

    NOTE: get_rate discards intervals that have already passed, so queries
    must be made with non-decreasing times.
    """
    def __init__(self, intervals):
        self.intervals = sorted(intervals)

    def get_rate(self, time):
        # Before the first interval starts (or with no intervals), rate is 0.
        if len(self.intervals) == 0 or time < self.intervals[0][0]:
            return 0

        target_interval = len(self.intervals) - 1
        for i in range(1, len(self.intervals)):
            if time < self.intervals[i][0]:
                target_interval = i - 1
                break

        # Drop elapsed intervals; assumes monotonically increasing queries.
        self.intervals = self.intervals[target_interval:]
        return self.intervals[0][1]

class SawtoothRateModel(RateModel):
    """Alternates between `low` and `high` every half `frequency` period."""
    def __init__(self, low, high, frequency):
        self.low = low
        self.high = high
        self.frequency = frequency

    def get_rate(self, time):
        if int(2 * time / self.frequency) % 2 == 0:
            return self.low
        else:
            return self.high

class DistributionRateModel(RateModel):
    """Redraws the rate from `distribution` every `frequency` seconds
    (on every call when frequency is 0)."""
    def __init__(self, distribution, frequency):
        self.distribution = distribution
        self.frequency = frequency
        self.last_change = 0
        self.rate = None

    def get_rate(self, time):
        if self.frequency == 0 or int((time - self.last_change) / self.frequency) > int(self.last_change / self.frequency) or self.rate is None:
            self.last_change = time
            self.rate = self.distribution()

        return self.rate

# diff --git a/contrib/grv_proxy_model/ratekeeper_model.py b/contrib/grv_proxy_model/ratekeeper_model.py
# new file mode 100755
# index 0000000000..57125dc4c0
# --- /dev/null
# +++ b/contrib/grv_proxy_model/ratekeeper_model.py
# @@ -0,0 +1,67 @@
#
# ratekeeper_model.py  (header originally said "ratekeeper.py"; corrected to match the filename)
#
# This source file is part of the FoundationDB open source project
#
# Copyright 2013-2020 Apple Inc.
and the FoundationDB project authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import numpy +import rate_model +from priority import Priority + +class RatekeeperModel: + def __init__(self, limit_models): + self.limit_models = limit_models + + def get_limit(self, time, priority): + return self.limit_models[priority].get_rate(time) + +predefined_ratekeeper = {} + +predefined_ratekeeper['default200_batch100'] = RatekeeperModel( +{ + Priority.SYSTEM: rate_model.UnlimitedRateModel(), + Priority.DEFAULT: rate_model.FixedRateModel(200), + Priority.BATCH: rate_model.FixedRateModel(100) +}) + +predefined_ratekeeper['default_sawtooth'] = RatekeeperModel( +{ + Priority.SYSTEM: rate_model.UnlimitedRateModel(), + Priority.DEFAULT: rate_model.SawtoothRateModel(10, 200, 1), + Priority.BATCH: rate_model.FixedRateModel(0) +}) + +predefined_ratekeeper['default_uniform_random'] = RatekeeperModel( +{ + Priority.SYSTEM: rate_model.UnlimitedRateModel(), + Priority.DEFAULT: rate_model.DistributionRateModel(lambda: numpy.random.uniform(10, 200), 1), + Priority.BATCH: rate_model.FixedRateModel(0) +}) + +predefined_ratekeeper['default_trickle'] = RatekeeperModel( +{ + Priority.SYSTEM: rate_model.UnlimitedRateModel(), + Priority.DEFAULT: rate_model.FixedRateModel(3), + Priority.BATCH: rate_model.FixedRateModel(0) +}) + +predefined_ratekeeper['default1000'] = RatekeeperModel( +{ + Priority.SYSTEM: rate_model.UnlimitedRateModel(), + Priority.DEFAULT: rate_model.FixedRateModel(1000), + 
#     Priority.BATCH: rate_model.FixedRateModel(500)
# })
# (tail of predefined_ratekeeper['default1000'] from the previous chunk, shown
#  as a comment because the chunk boundary splits the dict literal)

# diff --git a/contrib/grv_proxy_model/smoother.py b/contrib/grv_proxy_model/smoother.py
# new file mode 100644
# index 0000000000..bc1b32ea12
# --- /dev/null
# +++ b/contrib/grv_proxy_model/smoother.py
# @@ -0,0 +1,53 @@
#
# smoother.py
#
# This source file is part of the FoundationDB open source project
#
# Copyright 2013-2020 Apple Inc. and the FoundationDB project authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import math

class Smoother:
    """Exponentially-weighted smoother of a running total.

    Maintains `total` (the exact running value) and `estimate` (an
    exponentially-smoothed version of it with time constant `folding_time`).
    `smooth_rate` reports the smoothed rate at which the total is changing.
    """

    def __init__(self, folding_time):
        self.folding_time = folding_time
        self.reset(0)

    def reset(self, value):
        """Restart smoothing at time 0 with both total and estimate at `value`."""
        self.time = 0
        self.total = value
        self.estimate = value

    def set_total(self, time, total):
        """Set the running total to an absolute value at `time`."""
        self.add_delta(time, total - self.total)

    def add_delta(self, time, delta):
        """Advance the estimate to `time`, then add `delta` to the total."""
        self.update(time)
        self.total += delta

    def smooth_total(self, time):
        """Return the smoothed estimate of the total at `time`."""
        self.update(time)
        return self.estimate

    def smooth_rate(self, time):
        """Return the smoothed rate of change of the total at `time`."""
        self.update(time)
        return (self.total - self.estimate) / self.folding_time

    def update(self, time):
        # Fold the estimate toward the total by the fraction corresponding to
        # the elapsed time; non-positive elapsed time is a no-op.
        dt = time - self.time
        if dt <= 0:
            return
        self.time = time
        self.estimate += (self.total - self.estimate) * (1 - math.exp(-dt / self.folding_time))

# diff --git a/contrib/grv_proxy_model/workload_model.py b/contrib/grv_proxy_model/workload_model.py
# new file mode 100755
# index 0000000000..63fb4c472e
# --- /dev/null
# +++ b/contrib/grv_proxy_model/workload_model.py
# @@ -0,0 +1,201 @@
#
# workload_model.py
#
# This source file is
part of the FoundationDB open source project +# +# Copyright 2013-2020 Apple Inc. and the FoundationDB project authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import functools +import numpy +import math + +import rate_model +from priority import Priority + +@functools.total_ordering +class Request: + def __init__(self, time, count, priority): + self.time = time + self.count = count + self.priority = priority + + def __lt__(self, other): + return self.priority < other.priority + +class PriorityWorkloadModel: + def __init__(self, priority, rate_model, batch_model, generator, max_outstanding=1e9): + self.priority = priority + self.rate_model = rate_model + self.batch_model = batch_model + self.generator = generator + self.max_outstanding = max_outstanding + self.outstanding = 0 + + def next_request(self, time): + if self.outstanding >= self.max_outstanding: + return None + + batch_size = self.batch_model.next_batch() + self.outstanding += batch_size + interval = self.generator.next_request_interval(self.rate_model.get_rate(time)) + return Request(time + interval, batch_size, self.priority) + + def request_completed(self, request): + was_full = self.max_outstanding <= self.outstanding + self.outstanding -= request.count + + return was_full and self.outstanding < self.max_outstanding + +class WorkloadModel: + def __init__(self, workload_models): + self.workload_models = workload_models + + def priorities(self): + return list(self.workload_models.keys()) + + 
def next_request(self, time, priority): + return self.workload_models[priority].next_request(time) + + def request_completed(self, request): + return self.workload_models[request.priority].request_completed(request) + +class Distribution: + EXPONENTIAL = lambda x: numpy.random.exponential(x) + UNIFORM = lambda x: numpy.random.uniform(0, 2.0*x) + FIXED = lambda x: x + +class BatchGenerator: + def __init__(self): + pass + + def next_batch(self): + pass + +class DistributionBatchGenerator(BatchGenerator): + def __init__(self, distribution, size): + BatchGenerator.__init__(self) + self.distribution = distribution + self.size = size + + def next_batch(self): + return math.ceil(self.distribution(self.size)) + +class RequestGenerator: + def __init__(self): + pass + + def next_request_interval(self, rate): + pass + +class DistributionRequestGenerator(RequestGenerator): + def __init__(self, distribution): + RequestGenerator.__init__(self) + self.distribution = distribution + + def next_request_interval(self, rate): + if rate == 0: + return 1e9 + + return self.distribution(1.0/rate) + +predefined_workloads = {} + +predefined_workloads['slow_exponential'] = WorkloadModel( +{ + Priority.DEFAULT: PriorityWorkloadModel(Priority.DEFAULT, + rate_model.FixedRateModel(100), + DistributionBatchGenerator(Distribution.FIXED, 1), + DistributionRequestGenerator(Distribution.EXPONENTIAL), + max_outstanding=100 + ) +}) + +predefined_workloads['fixed_uniform'] = WorkloadModel( +{ + Priority.SYSTEM: PriorityWorkloadModel(Priority.SYSTEM, + rate_model.FixedRateModel(0), + DistributionBatchGenerator(Distribution.FIXED, 1), + DistributionRequestGenerator(Distribution.UNIFORM), + max_outstanding=10 + ), + Priority.DEFAULT: PriorityWorkloadModel(Priority.DEFAULT, + rate_model.FixedRateModel(95), + DistributionBatchGenerator(Distribution.FIXED, 10), + DistributionRequestGenerator(Distribution.UNIFORM), + max_outstanding=200 + ), + Priority.BATCH: PriorityWorkloadModel(Priority.BATCH, + 
rate_model.FixedRateModel(1), + DistributionBatchGenerator(Distribution.UNIFORM, 500), + DistributionRequestGenerator(Distribution.UNIFORM), + max_outstanding=200 + ) +}) + +predefined_workloads['batch_starvation'] = WorkloadModel( +{ + Priority.SYSTEM: PriorityWorkloadModel(Priority.SYSTEM, + rate_model.FixedRateModel(1), + DistributionBatchGenerator(Distribution.FIXED, 1), + DistributionRequestGenerator(Distribution.UNIFORM), + max_outstanding=10 + ), + Priority.DEFAULT: PriorityWorkloadModel(Priority.DEFAULT, + rate_model.IntervalRateModel([(0,50), (60,150), (120,90)]), + DistributionBatchGenerator(Distribution.FIXED, 1), + DistributionRequestGenerator(Distribution.UNIFORM), + max_outstanding=200 + ), + Priority.BATCH: PriorityWorkloadModel(Priority.BATCH, + rate_model.FixedRateModel(100), + DistributionBatchGenerator(Distribution.FIXED, 1), + DistributionRequestGenerator(Distribution.UNIFORM), + max_outstanding=200 + ) +}) + +predefined_workloads['default_low_high_low'] = WorkloadModel( +{ + Priority.SYSTEM: PriorityWorkloadModel(Priority.SYSTEM, + rate_model.FixedRateModel(0), + DistributionBatchGenerator(Distribution.FIXED, 1), + DistributionRequestGenerator(Distribution.UNIFORM), + max_outstanding=10 + ), + Priority.DEFAULT: PriorityWorkloadModel(Priority.DEFAULT, + rate_model.IntervalRateModel([(0,100), (60,300), (120,100)]), + DistributionBatchGenerator(Distribution.FIXED, 1), + DistributionRequestGenerator(Distribution.UNIFORM), + max_outstanding=200 + ), + Priority.BATCH: PriorityWorkloadModel(Priority.BATCH, + rate_model.FixedRateModel(0), + DistributionBatchGenerator(Distribution.FIXED, 1), + DistributionRequestGenerator(Distribution.UNIFORM), + max_outstanding=200 + ) +}) + +for rate in [83, 100, 180, 190, 200]: + predefined_workloads['default%d' % rate] = WorkloadModel( + { + Priority.DEFAULT: PriorityWorkloadModel(Priority.DEFAULT, + rate_model.FixedRateModel(rate), + DistributionBatchGenerator(Distribution.FIXED, 1), + 
DistributionRequestGenerator(Distribution.EXPONENTIAL), + max_outstanding=1000 + ) + }) From 82f7f541c39377ae2386cc52b777b354b3f545c4 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Wed, 25 Nov 2020 11:38:08 -0700 Subject: [PATCH 002/461] started lineage implementation --- flow/flow.cpp | 2 ++ flow/flow.h | 33 +++++++++++++++++++++++++++++++-- 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/flow/flow.cpp b/flow/flow.cpp index 89f04bd5df..a2bfcc1510 100644 --- a/flow/flow.cpp +++ b/flow/flow.cpp @@ -26,6 +26,8 @@ #include #include +thread_local ActorLineagePropertyMap* currentLineage = nullptr; + #if (defined(__linux__) || defined(__FreeBSD__)) && defined(__AVX__) && !defined(MEMORY_SANITIZER) // For benchmarking; need a version of rte_memcpy that doesn't live in the same compilation unit as the test. void * rte_memcpy_noinline(void *__restrict __dest, const void *__restrict __src, size_t __n) { diff --git a/flow/flow.h b/flow/flow.h index a72465143d..155c5db2a2 100644 --- a/flow/flow.h +++ b/flow/flow.h @@ -36,6 +36,7 @@ #include #include #include +#include #include "flow/Platform.h" #include "flow/FastAlloc.h" @@ -407,6 +408,30 @@ struct SingleCallback { } }; +// in the future we might want to read these from a different thread. std::shared_ptr +// seems to be better suited for this... +struct ActorLineagePropertyMap : std::enable_shared_from_this { + std::shared_ptr parent = nullptr; +}; + +extern thread_local ActorLineagePropertyMap* currentLineage; + +struct ActorLineage { + std::shared_ptr properties = std::make_shared(); + ActorLineage() { + if (currentLineage) { + properties->parent = currentLineage->shared_from_this(); + } + } +}; + +struct save_lineage { + ActorLineagePropertyMap* current = currentLineage; + ~save_lineage() { + currentLineage = current; + } +}; + // SAV is short for Single Assignment Variable: It can be assigned for only once! 
template struct SAV : private Callback, FastAllocated> { @@ -445,6 +470,7 @@ public: ASSERT(canBeSet()); new (&value_storage) T(std::forward(value)); this->error_state = Error::fromCode(SET_ERROR_CODE); + save_lineage _{}; while (Callback::next != this) Callback::next->fire(this->value()); } @@ -457,6 +483,7 @@ public: void sendError(Error err) { ASSERT(canBeSet() && int16_t(err.code()) > 0); this->error_state = err; + save_lineage _{}; while (Callback::next != this) Callback::next->error(err); } @@ -477,6 +504,7 @@ public: void finishSendAndDelPromiseRef() { // Call only after value_storage has already been initialized! this->error_state = Error::fromCode(SET_ERROR_CODE); + save_lineage _{}; while (Callback::next != this) Callback::next->fire(this->value()); @@ -500,6 +528,7 @@ public: } this->error_state = err; + save_lineage _{}; while (Callback::next != this) Callback::next->error(err); @@ -987,7 +1016,7 @@ static inline void destruct(T& t) { } template -struct Actor : SAV { +struct Actor : SAV, ActorLineage { int8_t actor_wait_state; // -1 means actor is cancelled; 0 means actor is not waiting; 1-N mean waiting in callback group # Actor() : SAV(1, 1), actor_wait_state(0) { /*++actorCount;*/ } @@ -995,7 +1024,7 @@ struct Actor : SAV { }; template <> -struct Actor { +struct Actor : ActorLineage { // This specialization is for a void actor (one not returning a future, hence also uncancellable) int8_t actor_wait_state; // 0 means actor is not waiting; 1-N mean waiting in callback group # From 05f77f905fb3a32c026729479de3de5456a5789e Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Mon, 7 Dec 2020 15:15:25 -0700 Subject: [PATCH 003/461] Added actor lineage --- flow/actorcompiler/ActorCompiler.cs | 1 + flow/actorcompiler/actorcompiler.csproj | 108 +----------------------- flow/actorcompiler/actorcompiler.sln | 34 ++++++++ flow/flow.cpp | 5 +- flow/flow.h | 96 +++++++++++++-------- flow/genericactors.actor.h | 4 + 6 files changed, 110 insertions(+), 138 
deletions(-) create mode 100644 flow/actorcompiler/actorcompiler.sln diff --git a/flow/actorcompiler/ActorCompiler.cs b/flow/actorcompiler/ActorCompiler.cs index 7aef82a42e..dc9de91868 100644 --- a/flow/actorcompiler/ActorCompiler.cs +++ b/flow/actorcompiler/ActorCompiler.cs @@ -452,6 +452,7 @@ namespace actorcompiler fullClassName, string.Join(", ", actor.parameters.Select(p => p.name).ToArray())); + writer.WriteLine("restore_lineage _;"); if (actor.returnType != null) writer.WriteLine("\treturn Future<{1}>({0});", newActor, actor.returnType); else diff --git a/flow/actorcompiler/actorcompiler.csproj b/flow/actorcompiler/actorcompiler.csproj index e737adabd2..b590913634 100644 --- a/flow/actorcompiler/actorcompiler.csproj +++ b/flow/actorcompiler/actorcompiler.csproj @@ -1,108 +1,8 @@ - - + + - Debug - 10.0.20506 - 2.0 - {0ECC1314-3FC2-458D-8E41-B50B4EA24E51} Exe - Properties - actorcompiler - actorcompiler - v4.0 - 512 - $(SolutionDir)bin\$(Configuration)\ - publish\ - true - Disk - false - Foreground - 7 - Days - false - false - true - 0 - 1.0.0.%2a - false - false - true + net5.0 - - true - DEBUG;TRACE - full - AnyCPU - default - prompt - false - false - - - TRACE - true - pdbonly - AnyCPU - default - prompt - false - false - - - - - 3.5 - - - 3.5 - - - 3.5 - - - 4.0 - - - - - - - - - - - - - - False - Microsoft .NET Framework 4 %28x86 and x64%29 - true - - - False - .NET Framework 3.5 SP1 Client Profile - false - - - False - .NET Framework 3.5 SP1 - false - - - False - Windows Installer 3.1 - true - - - - - - - + \ No newline at end of file diff --git a/flow/actorcompiler/actorcompiler.sln b/flow/actorcompiler/actorcompiler.sln new file mode 100644 index 0000000000..a4292bfaaa --- /dev/null +++ b/flow/actorcompiler/actorcompiler.sln @@ -0,0 +1,34 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 15 +VisualStudioVersion = 15.0.26124.0 +MinimumVisualStudioVersion = 15.0.26124.0 +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = 
"actorcompiler", "actorcompiler.csproj", "{0ECC1314-3FC2-458D-8E41-B50B4EA24E51}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Any CPU = Debug|Any CPU + Debug|x64 = Debug|x64 + Debug|x86 = Debug|x86 + Release|Any CPU = Release|Any CPU + Release|x64 = Release|x64 + Release|x86 = Release|x86 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Debug|Any CPU.Build.0 = Debug|Any CPU + {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Debug|x64.ActiveCfg = Debug|Any CPU + {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Debug|x64.Build.0 = Debug|Any CPU + {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Debug|x86.ActiveCfg = Debug|Any CPU + {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Debug|x86.Build.0 = Debug|Any CPU + {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Release|Any CPU.ActiveCfg = Debug|Any CPU + {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Release|Any CPU.Build.0 = Debug|Any CPU + {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Release|x64.ActiveCfg = Debug|Any CPU + {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Release|x64.Build.0 = Debug|Any CPU + {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Release|x86.ActiveCfg = Debug|Any CPU + {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Release|x86.Build.0 = Debug|Any CPU + EndGlobalSection +EndGlobal diff --git a/flow/flow.cpp b/flow/flow.cpp index a2bfcc1510..c4a6097300 100644 --- a/flow/flow.cpp +++ b/flow/flow.cpp @@ -26,7 +26,10 @@ #include #include -thread_local ActorLineagePropertyMap* currentLineage = nullptr; +extern thread_local Reference currentLineage; + +ActorLineage::ActorLineage() : parent(currentLineage) { +} #if (defined(__linux__) || defined(__FreeBSD__)) && defined(__AVX__) && !defined(MEMORY_SANITIZER) // For benchmarking; need a version of rte_memcpy that doesn't live in the same 
compilation unit as the test. diff --git a/flow/flow.h b/flow/flow.h index 155c5db2a2..a0c9793a7a 100644 --- a/flow/flow.h +++ b/flow/flow.h @@ -20,6 +20,7 @@ #ifndef FLOW_FLOW_H #define FLOW_FLOW_H +#include "flow/FastRef.h" #pragma once #pragma warning( disable: 4244 4267 ) // SOMEDAY: Carefully check for integer overflow issues (e.g. size_t to int conversions like this suppresses) @@ -408,28 +409,21 @@ struct SingleCallback { } }; -// in the future we might want to read these from a different thread. std::shared_ptr -// seems to be better suited for this... -struct ActorLineagePropertyMap : std::enable_shared_from_this { - std::shared_ptr parent = nullptr; +struct ActorLineagePropertyMap : ReferenceCounted { }; -extern thread_local ActorLineagePropertyMap* currentLineage; - -struct ActorLineage { - std::shared_ptr properties = std::make_shared(); - ActorLineage() { - if (currentLineage) { - properties->parent = currentLineage->shared_from_this(); - } - } +struct ActorLineage : ReferenceCounted { + Reference map; + Reference parent; + ActorLineage(); }; -struct save_lineage { - ActorLineagePropertyMap* current = currentLineage; - ~save_lineage() { - currentLineage = current; - } +extern thread_local Reference currentLineage; + +struct restore_lineage { + Reference lineage; + restore_lineage() : lineage(currentLineage) {} + ~restore_lineage() { currentLineage = lineage; } }; // SAV is short for Single Assignment Variable: It can be assigned for only once! 
@@ -447,7 +441,8 @@ public: T& value() { return *(T*)&value_storage; } - SAV(int futures, int promises) : futures(futures), promises(promises), error_state(Error::fromCode(UNSET_ERROR_CODE)) { + SAV(int futures, int promises) + : futures(futures), promises(promises), error_state(Error::fromCode(UNSET_ERROR_CODE)) { Callback::prev = Callback::next = this; } ~SAV() { @@ -466,13 +461,14 @@ public: } template - void send(U && value) { + void send(U&& value) { ASSERT(canBeSet()); new (&value_storage) T(std::forward(value)); this->error_state = Error::fromCode(SET_ERROR_CODE); - save_lineage _{}; - while (Callback::next != this) + restore_lineage _; + while (Callback::next != this) { Callback::next->fire(this->value()); + } } void send(Never) { @@ -483,13 +479,15 @@ public: void sendError(Error err) { ASSERT(canBeSet() && int16_t(err.code()) > 0); this->error_state = err; - save_lineage _{}; - while (Callback::next != this) + restore_lineage _; + while (Callback::next != this) { Callback::next->error(err); + } } template void sendAndDelPromiseRef(U && value) { + restore_lineage _; ASSERT(canBeSet()); if (promises == 1 && !futures) { // No one is left to receive the value, so we can just die @@ -503,8 +501,8 @@ public: void finishSendAndDelPromiseRef() { // Call only after value_storage has already been initialized! 
+ restore_lineage _; this->error_state = Error::fromCode(SET_ERROR_CODE); - save_lineage _{}; while (Callback::next != this) Callback::next->fire(this->value()); @@ -520,6 +518,7 @@ public: } void sendErrorAndDelPromiseRef(Error err) { + restore_lineage _; ASSERT(canBeSet() && int16_t(err.code()) > 0); if (promises == 1 && !futures) { // No one is left to receive the value, so we can just die @@ -528,7 +527,6 @@ public: } this->error_state = err; - save_lineage _{}; while (Callback::next != this) Callback::next->error(err); @@ -624,6 +622,7 @@ struct NotifiedQueue : private SingleCallback, FastAllocated if (error.isValid()) return; if (SingleCallback::next != this) { + restore_lineage _; SingleCallback::next->fire(std::forward(value)); } else { @@ -635,8 +634,10 @@ struct NotifiedQueue : private SingleCallback, FastAllocated if (error.isValid()) return; this->error = err; - if (SingleCallback::next != this) + if (SingleCallback::next != this) { + restore_lineage _; SingleCallback::next->error(err); + } } void addPromiseRef() { promises++; } @@ -1016,38 +1017,67 @@ static inline void destruct(T& t) { } template -struct Actor : SAV, ActorLineage { +struct Actor : SAV { + Reference lineage = Reference{new ActorLineage() }; int8_t actor_wait_state; // -1 means actor is cancelled; 0 means actor is not waiting; 1-N mean waiting in callback group # - Actor() : SAV(1, 1), actor_wait_state(0) { /*++actorCount;*/ } + Actor() : SAV(1, 1), actor_wait_state(0) { + /*++actorCount;*/ + currentLineage = lineage; + } + + Reference setLineage() { + auto res = currentLineage; + currentLineage = lineage; + return res; + } //~Actor() { --actorCount; } }; template <> -struct Actor : ActorLineage { +struct Actor { // This specialization is for a void actor (one not returning a future, hence also uncancellable) + Reference lineage = Reference{new ActorLineage() }; int8_t actor_wait_state; // 0 means actor is not waiting; 1-N mean waiting in callback group # - Actor() : actor_wait_state(0) 
{ /*++actorCount;*/ } + Actor() : actor_wait_state(0) { + /*++actorCount;*/ + currentLineage = lineage; + } + + Reference setLineage() { + auto res = currentLineage; + currentLineage = lineage; + return res; + } //~Actor() { --actorCount; } }; template struct ActorCallback : Callback { - virtual void fire(ValueType const& value) override { static_cast(this)->a_callback_fire(this, value); } - virtual void error(Error e) override { static_cast(this)->a_callback_error(this, e); } + virtual void fire(ValueType const& value) override { + auto _ = static_cast(this)->setLineage(); + static_cast(this)->a_callback_fire(this, value); + } + virtual void error(Error e) override { + auto _ = static_cast(this)->setLineage(); + static_cast(this)->a_callback_error(this, e); + } }; template struct ActorSingleCallback : SingleCallback { virtual void fire(ValueType const& value) override { + auto _ = static_cast(this)->setLineage(); static_cast(this)->a_callback_fire(this, value); } virtual void fire(ValueType && value) override { + auto _ = static_cast(this)->setLineage(); static_cast(this)->a_callback_fire(this, std::move(value)); } virtual void error(Error e) override { + auto _ = static_cast(this)->setLineage(); static_cast(this)->a_callback_error(this, e); } }; diff --git a/flow/genericactors.actor.h b/flow/genericactors.actor.h index 3fcab1f7dd..ab9d9c07d5 100644 --- a/flow/genericactors.actor.h +++ b/flow/genericactors.actor.h @@ -1493,6 +1493,10 @@ struct YieldedFutureActor : SAV, ActorCallback setLineage() { + return currentLineage; + } + void a_callback_fire(ActorCallback*, Void) { if (int16_t(in_error_state.code()) == UNSET_ERROR_CODE) { in_error_state = Error::fromCode(SET_ERROR_CODE); From d837e923ad9f8cbf3a5bcd5668a74d4ee0222c32 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Mon, 7 Dec 2020 15:23:18 -0700 Subject: [PATCH 004/461] minor bugfix --- flow/flow.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flow/flow.cpp b/flow/flow.cpp index 
c4a6097300..ed977141bd 100644 --- a/flow/flow.cpp +++ b/flow/flow.cpp @@ -26,7 +26,7 @@ #include #include -extern thread_local Reference currentLineage; +thread_local Reference currentLineage; ActorLineage::ActorLineage() : parent(currentLineage) { } From 2c4e38329e536172d2413da61d884ef944277598 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Wed, 9 Dec 2020 10:19:32 -0700 Subject: [PATCH 005/461] fix some compiler warnings --- fdbclient/SystemData.cpp | 6 +++--- fdbserver/BackupProgress.actor.cpp | 2 +- fdbserver/BackupWorker.actor.cpp | 6 +++--- fdbserver/CommitProxyServer.actor.cpp | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/fdbclient/SystemData.cpp b/fdbclient/SystemData.cpp index b402ad99a7..16733b1ad6 100644 --- a/fdbclient/SystemData.cpp +++ b/fdbclient/SystemData.cpp @@ -57,7 +57,7 @@ const Value keyServersValue( Standalone result, const std::vecto std::vector destTag; bool foundOldLocality = false; - for (const KeyValueRef kv : result) { + for (const KeyValueRef& kv : result) { UID uid = decodeServerTagKey(kv.key); if (std::find(src.begin(), src.end(), uid) != src.end()) { srcTag.push_back( decodeServerTagValue(kv.value) ); @@ -109,7 +109,7 @@ void decodeKeyServersValue( Standalone result, const ValueRef& v src.clear(); dest.clear(); - for (const KeyValueRef kv : result) { + for (const KeyValueRef& kv : result) { Tag tag = decodeServerTagValue(kv.value); if (std::find(srcTag.begin(), srcTag.end(), tag) != srcTag.end()) { src.push_back( decodeServerTagKey(kv.key) ); @@ -122,7 +122,7 @@ void decodeKeyServersValue( Standalone result, const ValueRef& v std::sort(dest.begin(), dest.end()); if(missingIsError && (src.size() != srcTag.size() || dest.size() != destTag.size())) { TraceEvent(SevError, "AttemptedToDecodeMissingTag"); - for (const KeyValueRef kv : result) { + for (const KeyValueRef& kv : result) { Tag tag = decodeServerTagValue(kv.value); UID serverID = decodeServerTagKey(kv.key); TraceEvent("TagUIDMap").detail("Tag", 
tag.toString()).detail("UID", serverID.toString()); diff --git a/fdbserver/BackupProgress.actor.cpp b/fdbserver/BackupProgress.actor.cpp index 3f1d564c16..f496ec0558 100644 --- a/fdbserver/BackupProgress.actor.cpp +++ b/fdbserver/BackupProgress.actor.cpp @@ -121,7 +121,7 @@ std::map, std::map> BackupProgr } } - for (const Tag tag : tags) { // tags without progress data + for (const Tag& tag : tags) { // tags without progress data tagVersions.insert({ tag, adjustedBeginVersion }); TraceEvent("BackupVersionRange", dbgid) .detail("OldEpoch", epoch) diff --git a/fdbserver/BackupWorker.actor.cpp b/fdbserver/BackupWorker.actor.cpp index 3cea9f6611..b5f78593e2 100644 --- a/fdbserver/BackupWorker.actor.cpp +++ b/fdbserver/BackupWorker.actor.cpp @@ -508,7 +508,7 @@ ACTOR Future setBackupKeys(BackupData* self, std::map savedL state std::vector>> prevVersions; state std::vector versionConfigs; state std::vector>> allWorkersReady; - for (const auto [uid, version] : savedLogVersions) { + for (const auto& [uid, version] : savedLogVersions) { versionConfigs.emplace_back(uid); prevVersions.push_back(versionConfigs.back().latestBackupWorkerSavedVersion().get(tr)); allWorkersReady.push_back(versionConfigs.back().allWorkerStarted().get(tr)); @@ -573,7 +573,7 @@ ACTOR Future monitorBackupProgress(BackupData* self) { if (self->recruitedEpoch == self->oldestBackupEpoch) { // update update progress so far if previous epochs are done Version v = std::numeric_limits::max(); - for (const auto [tag, version] : tagVersions) { + for (const auto& [tag, version] : tagVersions) { v = std::min(v, version); } savedLogVersions.emplace(uid, v); @@ -783,7 +783,7 @@ ACTOR Future saveMutationsToFile(BackupData* self, Version popVersion, int .detail("TagId", self->tag.id) .detail("File", file->getFileName()); } - for (const UID uid : activeUids) { + for (const UID& uid : activeUids) { self->backups[uid].lastSavedVersion = popVersion + 1; } diff --git a/fdbserver/CommitProxyServer.actor.cpp 
b/fdbserver/CommitProxyServer.actor.cpp index eac0f0d4c2..96ae4c000c 100644 --- a/fdbserver/CommitProxyServer.actor.cpp +++ b/fdbserver/CommitProxyServer.actor.cpp @@ -1778,7 +1778,7 @@ ACTOR Future commitProxyServerCore(CommitProxyInterface proxy, MasterInter state KeyRange txnKeys = allKeys; Standalone UIDtoTagMap = commitData.txnStateStore->readRange( serverTagKeys ).get(); state std::map tag_uid; - for (const KeyValueRef kv : UIDtoTagMap) { + for (const KeyValueRef& kv : UIDtoTagMap) { tag_uid[decodeServerTagValue(kv.value)] = decodeServerTagKey(kv.key); } loop { From 0d324cee80b306797e6f92392414b786ad5ce914 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Wed, 9 Dec 2020 10:19:59 -0700 Subject: [PATCH 006/461] Annotation framework and role lineage --- fdbrpc/CMakeLists.txt | 2 + fdbrpc/Locality.h | 1 + fdbrpc/RoleLineage.cpp | 23 ++++++++++ fdbrpc/RoleLineage.h | 31 +++++++++++++ fdbserver/worker.actor.cpp | 3 ++ flow/flow.cpp | 6 +++ flow/flow.h | 90 ++++++++++++++++++++++++++++++++------ 7 files changed, 142 insertions(+), 14 deletions(-) create mode 100644 fdbrpc/RoleLineage.cpp create mode 100644 fdbrpc/RoleLineage.h diff --git a/fdbrpc/CMakeLists.txt b/fdbrpc/CMakeLists.txt index b4fb20098d..41229dce47 100644 --- a/fdbrpc/CMakeLists.txt +++ b/fdbrpc/CMakeLists.txt @@ -22,6 +22,8 @@ set(FDBRPC_SRCS ReplicationPolicy.cpp ReplicationTypes.cpp ReplicationUtils.cpp + RoleLineage.h + RoleLineage.cpp Stats.actor.cpp Stats.h sim2.actor.cpp diff --git a/fdbrpc/Locality.h b/fdbrpc/Locality.h index 11c209071a..2129b7a3b7 100644 --- a/fdbrpc/Locality.h +++ b/fdbrpc/Locality.h @@ -63,6 +63,7 @@ struct ProcessClass { Ratekeeper, StorageCache, Backup, + Worker, // used for actor lineage tracking NoRole }; enum ClassSource { CommandLineSource, AutoSource, DBSource, InvalidSource = -1 }; diff --git a/fdbrpc/RoleLineage.cpp b/fdbrpc/RoleLineage.cpp new file mode 100644 index 0000000000..89a64bbe40 --- /dev/null +++ b/fdbrpc/RoleLineage.cpp @@ -0,0 +1,23 @@ +/* + * 
RoleLineage.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2020 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "fdbrpc/RoleLineage.h" + +StringRef RoleLineage::name = "RoleLineage"_sr; diff --git a/fdbrpc/RoleLineage.h b/fdbrpc/RoleLineage.h new file mode 100644 index 0000000000..30a2ea2650 --- /dev/null +++ b/fdbrpc/RoleLineage.h @@ -0,0 +1,31 @@ +/* + * RoleLineage.h + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2020 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once +#include "fdbrpc/Locality.h" + +struct RoleLineage : LineageProperties { + static StringRef name; + ProcessClass::ClusterRole role = ProcessClass::NoRole; + + bool isSet(ProcessClass::ClusterRole RoleLineage::*member) { + return this->*member != ProcessClass::NoRole; + } +}; diff --git a/fdbserver/worker.actor.cpp b/fdbserver/worker.actor.cpp index ca34f903a2..98363ea247 100644 --- a/fdbserver/worker.actor.cpp +++ b/fdbserver/worker.actor.cpp @@ -22,6 +22,7 @@ #include #include "fdbrpc/Locality.h" +#include "fdbrpc/RoleLineage.h" #include "fdbclient/StorageServerInterface.h" #include "fdbserver/Knobs.h" #include "flow/ActorCollection.h" @@ -46,6 +47,7 @@ #include "flow/Profiler.h" #include "flow/ThreadHelper.actor.h" #include "flow/Trace.h" +#include "flow/flow.h" #ifdef __linux__ #include @@ -1810,6 +1812,7 @@ ACTOR Future fdbd( { state vector> actors; state Promise recoveredDiskFiles; + currentLineage->modify(&RoleLineage::role) = ProcessClass::Worker; try { ServerCoordinators coordinators( connFile ); diff --git a/flow/flow.cpp b/flow/flow.cpp index ed977141bd..5b354fe054 100644 --- a/flow/flow.cpp +++ b/flow/flow.cpp @@ -31,6 +31,12 @@ thread_local Reference currentLineage; ActorLineage::ActorLineage() : parent(currentLineage) { } +ActorLineage::~ActorLineage() { + for (auto ptr : properties) { + delete ptr.second; + } +} + #if (defined(__linux__) || defined(__FreeBSD__)) && defined(__AVX__) && !defined(MEMORY_SANITIZER) // For benchmarking; need a version of rte_memcpy that doesn't live in the same compilation unit as the test. 
void * rte_memcpy_noinline(void *__restrict __dest, const void *__restrict __src, size_t __n) { diff --git a/flow/flow.h b/flow/flow.h index a0c9793a7a..0ffc895a86 100644 --- a/flow/flow.h +++ b/flow/flow.h @@ -20,6 +20,7 @@ #ifndef FLOW_FLOW_H #define FLOW_FLOW_H +#include "flow/Arena.h" #include "flow/FastRef.h" #pragma once @@ -29,6 +30,7 @@ #include #include +#include #include #include #include @@ -409,21 +411,88 @@ struct SingleCallback { } }; -struct ActorLineagePropertyMap : ReferenceCounted { +struct LineagePropertiesBase { +}; + +// helper class to make implementation of LineageProperties easier +template +struct LineageProperties : LineagePropertiesBase { + // Contract: + // + // StringRef name = "SomeUniqueName"_str; + + + // this has to be implemented by subclasses + // but can't be made virtual. + // A user should implement this for any type + // within the properies class. + template + bool isSet(Value Derived::*member) { + return true; + } }; struct ActorLineage : ReferenceCounted { - Reference map; +private: + std::unordered_map properties; Reference parent; +public: ActorLineage(); + ~ActorLineage(); + bool isRoot() const { + return parent.getPtr() == nullptr; + } + void makeRoot() { + parent.clear(); + } + template + V& modify(V T::*member) { + auto& res = properties[T::name]; + if (!res) { + res = new T{}; + } + T* map = static_cast(res); + return map->*member; + } + template + std::optional get(V T::*member) const { + auto current = this; + while (current != nullptr) { + auto iter = current->properties.find(T::name); + if (iter != current->properties.end()) { + T const& map = static_cast(*iter->second); + if (map.isSet(member)) { + return map.*member; + } + } + current = current->parent.getPtr(); + } + return std::optional{}; + } + template + std::stack stack(V T::*member) const { + auto current = this; + std::stack res; + while (current != nullptr) { + auto iter = current->properties.find(T::name); + if (iter != current->properties.end()) { + T 
const& map = static_cast(*iter->second); + if (map.isSet(member)) { + res.push(map.*member); + } + } + current = current->parent.getPtr(); + } + return res; + } }; extern thread_local Reference currentLineage; struct restore_lineage { - Reference lineage; - restore_lineage() : lineage(currentLineage) {} - ~restore_lineage() { currentLineage = lineage; } + Reference prev; + restore_lineage() : prev(currentLineage) {} + ~restore_lineage() { currentLineage = prev; } }; // SAV is short for Single Assignment Variable: It can be assigned for only once! @@ -465,7 +534,6 @@ public: ASSERT(canBeSet()); new (&value_storage) T(std::forward(value)); this->error_state = Error::fromCode(SET_ERROR_CODE); - restore_lineage _; while (Callback::next != this) { Callback::next->fire(this->value()); } @@ -479,7 +547,6 @@ public: void sendError(Error err) { ASSERT(canBeSet() && int16_t(err.code()) > 0); this->error_state = err; - restore_lineage _; while (Callback::next != this) { Callback::next->error(err); } @@ -487,7 +554,6 @@ public: template void sendAndDelPromiseRef(U && value) { - restore_lineage _; ASSERT(canBeSet()); if (promises == 1 && !futures) { // No one is left to receive the value, so we can just die @@ -501,7 +567,6 @@ public: void finishSendAndDelPromiseRef() { // Call only after value_storage has already been initialized! 
- restore_lineage _; this->error_state = Error::fromCode(SET_ERROR_CODE); while (Callback::next != this) Callback::next->fire(this->value()); @@ -518,7 +583,6 @@ public: } void sendErrorAndDelPromiseRef(Error err) { - restore_lineage _; ASSERT(canBeSet() && int16_t(err.code()) > 0); if (promises == 1 && !futures) { // No one is left to receive the value, so we can just die @@ -622,7 +686,6 @@ struct NotifiedQueue : private SingleCallback, FastAllocated if (error.isValid()) return; if (SingleCallback::next != this) { - restore_lineage _; SingleCallback::next->fire(std::forward(value)); } else { @@ -635,7 +698,6 @@ struct NotifiedQueue : private SingleCallback, FastAllocated this->error = err; if (SingleCallback::next != this) { - restore_lineage _; SingleCallback::next->error(err); } } @@ -1025,13 +1087,13 @@ struct Actor : SAV { /*++actorCount;*/ currentLineage = lineage; } + //~Actor() { --actorCount; } Reference setLineage() { auto res = currentLineage; currentLineage = lineage; return res; } - //~Actor() { --actorCount; } }; template <> @@ -1045,13 +1107,13 @@ struct Actor { /*++actorCount;*/ currentLineage = lineage; } + //~Actor() { --actorCount; } Reference setLineage() { auto res = currentLineage; currentLineage = lineage; return res; } - //~Actor() { --actorCount; } }; template From 945d0246cddc0dcfff982f22af54c43617bc79a8 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Wed, 9 Dec 2020 13:28:15 -0700 Subject: [PATCH 007/461] add actor stacktrace feature --- flow/actorcompiler/ActorCompiler.cs | 3 ++- flow/flow.cpp | 6 ++++++ flow/flow.h | 12 ++++++++++++ 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/flow/actorcompiler/ActorCompiler.cs b/flow/actorcompiler/ActorCompiler.cs index dc9de91868..28771f4503 100644 --- a/flow/actorcompiler/ActorCompiler.cs +++ b/flow/actorcompiler/ActorCompiler.cs @@ -452,7 +452,7 @@ namespace actorcompiler fullClassName, string.Join(", ", actor.parameters.Select(p => p.name).ToArray())); - 
writer.WriteLine("restore_lineage _;"); + writer.WriteLine("\trestore_lineage _;"); if (actor.returnType != null) writer.WriteLine("\treturn Future<{1}>({0});", newActor, actor.returnType); else @@ -1287,6 +1287,7 @@ namespace actorcompiler constructor.WriteLine("{"); constructor.Indent(+1); ProbeEnter(constructor, actor.name); + constructor.WriteLine("currentLineage->modify(&StackLineage::actorName) = LiteralStringRef(\"{0}\");", actor.name); constructor.WriteLine("this->{0};", body.call()); ProbeExit(constructor, actor.name); WriteFunction(writer, constructor, constructor.BodyText); diff --git a/flow/flow.cpp b/flow/flow.cpp index 5b354fe054..2e47847fcd 100644 --- a/flow/flow.cpp +++ b/flow/flow.cpp @@ -37,6 +37,12 @@ ActorLineage::~ActorLineage() { } } +StringRef StackLineage::name = "StackLineage"_sr; + +std::stack getActorStackTrace() { + return currentLineage->stack(&StackLineage::actorName); +} + #if (defined(__linux__) || defined(__FreeBSD__)) && defined(__AVX__) && !defined(MEMORY_SANITIZER) // For benchmarking; need a version of rte_memcpy that doesn't live in the same compilation unit as the test. void * rte_memcpy_noinline(void *__restrict __dest, const void *__restrict __src, size_t __n) { diff --git a/flow/flow.h b/flow/flow.h index 0ffc895a86..518dbd036c 100644 --- a/flow/flow.h +++ b/flow/flow.h @@ -495,6 +495,18 @@ struct restore_lineage { ~restore_lineage() { currentLineage = prev; } }; +struct StackLineage : LineageProperties { + static StringRef name; + StringRef actorName; + + template + bool isSet(Value StackLineage::*member) { + return true; + } +}; + +extern std::stack getActorStackTrace(); + // SAV is short for Single Assignment Variable: It can be assigned for only once! 
template struct SAV : private Callback, FastAllocated> { From f8e1df6c4f8c5a687afffe2b9a28aa13e32ae9d5 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Thu, 10 Dec 2020 10:42:04 -0700 Subject: [PATCH 008/461] Support for actor stack traces --- fdbrpc/RoleLineage.h | 2 +- fdbserver/CMakeLists.txt | 1 + fdbserver/SigStack.cpp | 23 +++++++++++++++++++++++ fdbserver/worker.actor.cpp | 3 +++ flow/flow.h | 7 +------ tests/TestRunner/local_cluster.py | 2 +- 6 files changed, 30 insertions(+), 8 deletions(-) create mode 100644 fdbserver/SigStack.cpp diff --git a/fdbrpc/RoleLineage.h b/fdbrpc/RoleLineage.h index 30a2ea2650..8e9d3f4e9e 100644 --- a/fdbrpc/RoleLineage.h +++ b/fdbrpc/RoleLineage.h @@ -25,7 +25,7 @@ struct RoleLineage : LineageProperties { static StringRef name; ProcessClass::ClusterRole role = ProcessClass::NoRole; - bool isSet(ProcessClass::ClusterRole RoleLineage::*member) { + bool isSet(ProcessClass::ClusterRole RoleLineage::*member) const { return this->*member != ProcessClass::NoRole; } }; diff --git a/fdbserver/CMakeLists.txt b/fdbserver/CMakeLists.txt index bf266069cb..f52e5b8279 100644 --- a/fdbserver/CMakeLists.txt +++ b/fdbserver/CMakeLists.txt @@ -88,6 +88,7 @@ set(FDBSERVER_SRCS ResolverInterface.h ServerDBInfo.actor.h ServerDBInfo.h + SigStack.cpp SimulatedCluster.actor.cpp SimulatedCluster.h SkipList.cpp diff --git a/fdbserver/SigStack.cpp b/fdbserver/SigStack.cpp new file mode 100644 index 0000000000..efec5aff7d --- /dev/null +++ b/fdbserver/SigStack.cpp @@ -0,0 +1,23 @@ +#include "flow/flow.h" +#include +#include +#include + +// This is not yet correct, as this is not async safe +// However, this should be good enough for an initial +// proof of concept. 
+extern "C" void stackSignalHandler(int sig) { + auto stack = getActorStackTrace(); + int i = 0; + while (!stack.empty()) { + auto s = stack.top(); + stack.pop(); + std::string_view n(reinterpret_cast(s.begin()), s.size()); + std::cout << i << ": " << n << std::endl; + ++i; + } +} + +void setupStackSignal() { + std::signal(SIGUSR1, &stackSignalHandler); +} diff --git a/fdbserver/worker.actor.cpp b/fdbserver/worker.actor.cpp index 98363ea247..5d371c0c80 100644 --- a/fdbserver/worker.actor.cpp +++ b/fdbserver/worker.actor.cpp @@ -1798,6 +1798,8 @@ ACTOR Future monitorLeaderRemotelyWithDelayedCandidacy( Reference fdbd( Reference connFile, LocalityData localities, @@ -1812,6 +1814,7 @@ ACTOR Future fdbd( { state vector> actors; state Promise recoveredDiskFiles; + setupStackSignal(); currentLineage->modify(&RoleLineage::role) = ProcessClass::Worker; try { diff --git a/flow/flow.h b/flow/flow.h index 518dbd036c..b1e4c1e1fb 100644 --- a/flow/flow.h +++ b/flow/flow.h @@ -427,7 +427,7 @@ struct LineageProperties : LineagePropertiesBase { // A user should implement this for any type // within the properies class. 
template - bool isSet(Value Derived::*member) { + bool isSet(Value Derived::*member) const { return true; } }; @@ -498,11 +498,6 @@ struct restore_lineage { struct StackLineage : LineageProperties { static StringRef name; StringRef actorName; - - template - bool isSet(Value StackLineage::*member) { - return true; - } }; extern std::stack getActorStackTrace(); diff --git a/tests/TestRunner/local_cluster.py b/tests/TestRunner/local_cluster.py index 68318d51dd..85f2094774 100644 --- a/tests/TestRunner/local_cluster.py +++ b/tests/TestRunner/local_cluster.py @@ -38,7 +38,7 @@ cluster_file = {etcdir}/fdb.cluster command = {fdbserver_bin} public_address = auto:$ID listen_address = public -datadir = {datadir} +datadir = {datadir}/$ID logdir = {logdir} # logsize = 10MiB # maxlogssize = 100MiB From fb64902d5c5b6e88501ebe906d4d939f61257b9b Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Tue, 19 Jan 2021 16:04:09 -0700 Subject: [PATCH 009/461] Assign roles --- fdbrpc/CMakeLists.txt | 2 -- fdbserver/CMakeLists.txt | 2 ++ .../RoleLineage.actor.cpp | 2 +- .../RoleLineage.actor.h | 21 ++++++++++++++- fdbserver/worker.actor.cpp | 26 ++++++++++++++++++- flow/flow.cpp | 5 ++-- flow/flow.h | 16 ++++++++++++ 7 files changed, 67 insertions(+), 7 deletions(-) rename fdbrpc/RoleLineage.cpp => fdbserver/RoleLineage.actor.cpp (95%) rename fdbrpc/RoleLineage.h => fdbserver/RoleLineage.actor.h (59%) diff --git a/fdbrpc/CMakeLists.txt b/fdbrpc/CMakeLists.txt index 7a9ce26a10..af84676be7 100644 --- a/fdbrpc/CMakeLists.txt +++ b/fdbrpc/CMakeLists.txt @@ -22,8 +22,6 @@ set(FDBRPC_SRCS ReplicationPolicy.cpp ReplicationTypes.cpp ReplicationUtils.cpp - RoleLineage.h - RoleLineage.cpp Stats.actor.cpp Stats.h sim2.actor.cpp diff --git a/fdbserver/CMakeLists.txt b/fdbserver/CMakeLists.txt index afc45b2cc4..9e406a0d26 100644 --- a/fdbserver/CMakeLists.txt +++ b/fdbserver/CMakeLists.txt @@ -86,6 +86,8 @@ set(FDBSERVER_SRCS RestoreWorker.actor.cpp Resolver.actor.cpp ResolverInterface.h + 
RoleLineage.actor.h + RoleLineage.actor.cpp ServerDBInfo.actor.h ServerDBInfo.h SigStack.cpp diff --git a/fdbrpc/RoleLineage.cpp b/fdbserver/RoleLineage.actor.cpp similarity index 95% rename from fdbrpc/RoleLineage.cpp rename to fdbserver/RoleLineage.actor.cpp index 89a64bbe40..6d1b49527a 100644 --- a/fdbrpc/RoleLineage.cpp +++ b/fdbserver/RoleLineage.actor.cpp @@ -18,6 +18,6 @@ * limitations under the License. */ -#include "fdbrpc/RoleLineage.h" +#include "fdbserver/RoleLineage.actor.h" StringRef RoleLineage::name = "RoleLineage"_sr; diff --git a/fdbrpc/RoleLineage.h b/fdbserver/RoleLineage.actor.h similarity index 59% rename from fdbrpc/RoleLineage.h rename to fdbserver/RoleLineage.actor.h index 8e9d3f4e9e..d35c749771 100644 --- a/fdbrpc/RoleLineage.h +++ b/fdbserver/RoleLineage.actor.h @@ -1,5 +1,5 @@ /* - * RoleLineage.h + * RoleLineage.actor.h * * This source file is part of the FoundationDB open source project * @@ -19,7 +19,15 @@ */ #pragma once +#include "flow/flow.h" +#if defined(NO_INTELLISENSE) && !defined(FDBSERVER_ROLE_LINEAGE_ACTOR_G_H) +# define FDBSERVER_ROLE_LINEAGE_ACTOR_G_H +# include "fdbserver/RoleLineage.actor.g.h" +#elif !defined(FDBSERVER_ROLE_LINEAGE_ACTOR_H) +# define FDBSERVER_ROLE_LINEAGE_ACTOR_H + #include "fdbrpc/Locality.h" +#include "flow/actorcompiler.h" // This must be the last include struct RoleLineage : LineageProperties { static StringRef name; @@ -29,3 +37,14 @@ struct RoleLineage : LineageProperties { return this->*member != ProcessClass::NoRole; } }; + +// creates a new root and sets the role lineage +ACTOR template +Future()())> runInRole(Fun fun, ProcessClass::ClusterRole role) { + currentLineage->makeRoot(); + currentLineage->modify(&RoleLineage::role) = role; + decltype(std::declval()()) res = wait(fun()); + return res; +} + +#endif diff --git a/fdbserver/worker.actor.cpp b/fdbserver/worker.actor.cpp index 36f5c14860..19aea8622c 100644 --- a/fdbserver/worker.actor.cpp +++ b/fdbserver/worker.actor.cpp @@ -22,7 +22,6 @@ 
#include #include "fdbrpc/Locality.h" -#include "fdbrpc/RoleLineage.h" #include "fdbclient/StorageServerInterface.h" #include "fdbserver/Knobs.h" #include "flow/ActorCollection.h" @@ -33,6 +32,7 @@ #include "fdbclient/NativeAPI.actor.h" #include "fdbserver/MetricLogger.h" #include "fdbserver/BackupInterface.h" +#include "fdbserver/RoleLineage.actor.h" #include "fdbserver/WorkerInterface.actor.h" #include "fdbserver/IKeyValueStore.h" #include "fdbserver/WaitFailure.h" @@ -1024,6 +1024,8 @@ ACTOR Future workerServer( DiskStore s = stores[f]; // FIXME: Error handling if( s.storedComponent == DiskStore::Storage ) { + LocalLineage _; + currentLineage->modify(&RoleLineage::role) = ProcessClass::ClusterRole::Storage; IKeyValueStore* kv = openKVStore(s.storeType, s.filename, s.storeID, memoryLimit, false, validateDataFiles); Future kvClosed = kv->onClosed(); filesClosed.add( kvClosed ); @@ -1058,6 +1060,8 @@ ACTOR Future workerServer( f = storageServerRollbackRebooter( f, s.storeType, s.filename, recruited.id(), recruited.locality, dbInfo, folder, &filesClosed, memoryLimit, kv); errorForwarders.add( forwardError( errors, Role::STORAGE_SERVER, recruited.id(), f ) ); } else if( s.storedComponent == DiskStore::TLogData ) { + LocalLineage _; + currentLineage->modify(&RoleLineage::role) = ProcessClass::ClusterRole::TLog; std::string logQueueBasename; const std::string filename = basename(s.filename); if (StringRef(filename).startsWith(fileLogDataPrefix)) { @@ -1218,6 +1222,8 @@ ACTOR Future workerServer( } } when( RecruitMasterRequest req = waitNext(interf.master.getFuture()) ) { + LocalLineage _; + currentLineage->modify(&RoleLineage::role) = ProcessClass::ClusterRole::Master; MasterInterface recruited; recruited.locality = locality; recruited.initEndpoints(); @@ -1238,6 +1244,8 @@ ACTOR Future workerServer( req.reply.send(recruited); } when ( InitializeDataDistributorRequest req = waitNext(interf.dataDistributor.getFuture()) ) { + LocalLineage _; + 
currentLineage->modify(&RoleLineage::role) = ProcessClass::ClusterRole::DataDistributor; DataDistributorInterface recruited(locality); recruited.initEndpoints(); @@ -1256,6 +1264,8 @@ ACTOR Future workerServer( req.reply.send(recruited); } when ( InitializeRatekeeperRequest req = waitNext(interf.ratekeeper.getFuture()) ) { + LocalLineage _; + currentLineage->modify(&RoleLineage::role) = ProcessClass::ClusterRole::Ratekeeper; RatekeeperInterface recruited(locality, req.reqId); recruited.initEndpoints(); @@ -1280,6 +1290,8 @@ ACTOR Future workerServer( } when (InitializeBackupRequest req = waitNext(interf.backup.getFuture())) { if (!backupWorkerCache.exists(req.reqId)) { + LocalLineage _; + currentLineage->modify(&RoleLineage::role) = ProcessClass::ClusterRole::Backup; BackupInterface recruited(locality); recruited.initEndpoints(); @@ -1309,6 +1321,8 @@ ACTOR Future workerServer( .detail("MinRecruitable", TLogVersion::MIN_RECRUITABLE); req.reply.sendError(internal_error()); } + LocalLineage _; + currentLineage->modify(&RoleLineage::role) = ProcessClass::ClusterRole::TLog; TLogOptions tLogOptions(req.logVersion, req.spillType); TLogFn tLogFn = tLogFnForOptions(tLogOptions); auto& logData = sharedLogs[SharedLogsKey(tLogOptions, req.storeType)]; @@ -1341,6 +1355,8 @@ ACTOR Future workerServer( } when( InitializeStorageRequest req = waitNext(interf.storage.getFuture()) ) { if( !storageCache.exists( req.reqId ) ) { + LocalLineage _; + currentLineage->modify(&RoleLineage::role) = ProcessClass::ClusterRole::Storage; StorageServerInterface recruited(req.interfaceId); recruited.locality = locality; recruited.initEndpoints(); @@ -1379,6 +1395,8 @@ ACTOR Future workerServer( forwardPromise( req.reply, storageCache.get( req.reqId ) ); } when(InitializeCommitProxyRequest req = waitNext(interf.commitProxy.getFuture())) { + LocalLineage _; + currentLineage->modify(&RoleLineage::role) = ProcessClass::ClusterRole::CommitProxy; CommitProxyInterface recruited; recruited.processId = 
locality.processId(); recruited.provisional = false; @@ -1402,6 +1420,8 @@ ACTOR Future workerServer( req.reply.send(recruited); } when( InitializeGrvProxyRequest req = waitNext(interf.grvProxy.getFuture()) ) { + LocalLineage _; + currentLineage->modify(&RoleLineage::role) = ProcessClass::ClusterRole::GrvProxy; GrvProxyInterface recruited; recruited.processId = locality.processId(); recruited.provisional = false; @@ -1421,6 +1441,8 @@ ACTOR Future workerServer( req.reply.send(recruited); } when( InitializeResolverRequest req = waitNext(interf.resolver.getFuture()) ) { + LocalLineage _; + currentLineage->modify(&RoleLineage::role) = ProcessClass::ClusterRole::Resolver; ResolverInterface recruited; recruited.locality = locality; recruited.initEndpoints(); @@ -1438,6 +1460,8 @@ ACTOR Future workerServer( req.reply.send(recruited); } when( InitializeLogRouterRequest req = waitNext(interf.logRouter.getFuture()) ) { + LocalLineage _; + currentLineage->modify(&RoleLineage::role) = ProcessClass::ClusterRole::LogRouter; TLogInterface recruited(locality); recruited.initEndpoints(); diff --git a/flow/flow.cpp b/flow/flow.cpp index 2e47847fcd..c90bbbe9ae 100644 --- a/flow/flow.cpp +++ b/flow/flow.cpp @@ -28,8 +28,9 @@ thread_local Reference currentLineage; -ActorLineage::ActorLineage() : parent(currentLineage) { -} +LineagePropertiesBase::~LineagePropertiesBase() {} + +ActorLineage::ActorLineage() : parent(currentLineage) {} ActorLineage::~ActorLineage() { for (auto ptr : properties) { diff --git a/flow/flow.h b/flow/flow.h index e043ab49d4..9b3ba698b6 100644 --- a/flow/flow.h +++ b/flow/flow.h @@ -412,6 +412,7 @@ struct SingleCallback { }; struct LineagePropertiesBase { + virtual ~LineagePropertiesBase(); }; // helper class to make implementation of LineageProperties easier @@ -433,6 +434,7 @@ struct LineageProperties : LineagePropertiesBase { }; struct ActorLineage : ReferenceCounted { + friend class LocalLineage; private: std::unordered_map properties; Reference parent; @@ 
-489,6 +491,20 @@ public: extern thread_local Reference currentLineage; +// This class can be used in order to modify all lineage properties +// of actors created within a (non-actor) scope +struct LocalLineage { + Reference lineage = Reference{new ActorLineage() }; + Reference oldLineage; + LocalLineage() { + oldLineage = currentLineage; + currentLineage = lineage; + } + ~LocalLineage() { + currentLineage = oldLineage; + } +}; + struct restore_lineage { Reference prev; restore_lineage() : prev(currentLineage) {} From f40d8c2f490a08351ce3d7e91bfd6752e268548a Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Tue, 19 Jan 2021 16:04:21 -0700 Subject: [PATCH 010/461] make profiler signal handler reentrant safe --- flow/Profiler.actor.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/flow/Profiler.actor.cpp b/flow/Profiler.actor.cpp index ece9bcfafd..33d1542db7 100644 --- a/flow/Profiler.actor.cpp +++ b/flow/Profiler.actor.cpp @@ -148,6 +148,8 @@ struct Profiler { } void signal_handler() { // async signal safe! + static std::atomic inSigHandler = false; + if (!inSigHandler.exchange(true)) { return; } if(profilingEnabled) { double t = timer(); output_buffer->push(*(void**)&t); @@ -156,6 +158,7 @@ struct Profiler { output_buffer->push(addresses[i]); output_buffer->push((void*)-1LL); } + inSigHandler.store(false); } static void signal_handler_for_closure(int, siginfo_t* si, void*, void* self) { // async signal safe! From c3efbe3040770dae65319446b9b3877f29b0ee44 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Tue, 19 Jan 2021 16:52:30 -0700 Subject: [PATCH 011/461] fixed minor bug --- flow/Profiler.actor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flow/Profiler.actor.cpp b/flow/Profiler.actor.cpp index 33d1542db7..d691f46205 100644 --- a/flow/Profiler.actor.cpp +++ b/flow/Profiler.actor.cpp @@ -149,7 +149,7 @@ struct Profiler { void signal_handler() { // async signal safe! 
static std::atomic inSigHandler = false; - if (!inSigHandler.exchange(true)) { return; } + if (inSigHandler.exchange(true)) { return; } if(profilingEnabled) { double t = timer(); output_buffer->push(*(void**)&t); From 29c626ca6a0d02f1d412327e177cc5db36b02042 Mon Sep 17 00:00:00 2001 From: Jon Fu Date: Mon, 15 Mar 2021 17:36:13 -0400 Subject: [PATCH 012/461] Changed code flow to fix loophole that avoided the knob guarding higher protocol versions and also added new restarting tests --- fdbserver/MoveKeys.actor.cpp | 24 ++++++++------- tests/CMakeLists.txt | 3 ++ .../to_6.2.33/CycleTestRestart-1.txt | 30 +++++++++++++++++++ .../to_6.2.33/CycleTestRestart-2.txt | 26 ++++++++++++++++ 4 files changed, 73 insertions(+), 10 deletions(-) create mode 100644 tests/restarting/to_6.2.33/CycleTestRestart-1.txt create mode 100644 tests/restarting/to_6.2.33/CycleTestRestart-2.txt diff --git a/fdbserver/MoveKeys.actor.cpp b/fdbserver/MoveKeys.actor.cpp index c08f3f3476..83f7170e95 100644 --- a/fdbserver/MoveKeys.actor.cpp +++ b/fdbserver/MoveKeys.actor.cpp @@ -1232,23 +1232,27 @@ void seedShardServers(Arena& arena, CommitTransactionRef& tr, vector serverTags; + std::vector serverSrcUID; serverTags.reserve(servers.size()); - for (int i = 0; i < servers.size(); i++) - serverTags.push_back(server_tag[servers[i].id()]); + for (auto& s : servers) { + serverTags.push_back(server_tag[s.id()]); + serverSrcUID.push_back(s.id()); + } + auto ksValue = CLIENT_KNOBS->TAG_ENCODE_KEY_SERVERS ? 
keyServersValue(serverTags) + : keyServersValue(Standalone(), serverSrcUID); // We have to set this range in two blocks, because the master tracking of "keyServersLocations" depends on a change // to a specific // key (keyServersKeyServersKey) - krmSetPreviouslyEmptyRange( - tr, arena, keyServersPrefix, KeyRangeRef(KeyRef(), allKeys.end), keyServersValue(serverTags), Value()); + krmSetPreviouslyEmptyRange(tr, arena, keyServersPrefix, KeyRangeRef(KeyRef(), allKeys.end), ksValue, Value()); - for (int s = 0; s < servers.size(); s++) - krmSetPreviouslyEmptyRange( - tr, arena, serverKeysPrefixFor(servers[s].id()), allKeys, serverKeysTrue, serverKeysFalse); + for (auto& s : servers) { + krmSetPreviouslyEmptyRange(tr, arena, serverKeysPrefixFor(s.id()), allKeys, serverKeysTrue, serverKeysFalse); + } } diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 132616b1bb..16f0eb2170 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -204,6 +204,9 @@ if(WITH_PYTHON) add_fdb_test( TEST_FILES restarting/from_7.0.0/UpgradeAndBackupRestore-1.toml restarting/from_7.0.0/UpgradeAndBackupRestore-2.toml) + add_fdb_test( + TEST_FILES restarting/to_6.2.33/CycleTestRestart-1.txt + restarting/to_6.2.33/CycleTestRestart-2.txt IGNORE) add_fdb_test( TEST_FILES restarting/to_6.3.10/CycleTestRestart-1.txt restarting/to_6.3.10/CycleTestRestart-2.txt) diff --git a/tests/restarting/to_6.2.33/CycleTestRestart-1.txt b/tests/restarting/to_6.2.33/CycleTestRestart-1.txt new file mode 100644 index 0000000000..647c2f3fe3 --- /dev/null +++ b/tests/restarting/to_6.2.33/CycleTestRestart-1.txt @@ -0,0 +1,30 @@ +testTitle=Clogged + clearAfterTest=false + testName=Cycle + transactionsPerSecond=500.0 + nodeCount=2500 + testDuration=10.0 + expectedRate=0 + + testName=RandomClogging + testDuration=10.0 + + testName=Rollback + meanDelay=10.0 + testDuration=10.0 + + testName=Attrition + machinesToKill=10 + machinesToLeave=3 + reboot=true + testDuration=10.0 + + testName=Attrition + 
machinesToKill=10 + machinesToLeave=3 + reboot=true + testDuration=10.0 + + testName=SaveAndKill + restartInfoLocation=simfdb/restartInfo.ini + testDuration=10.0 diff --git a/tests/restarting/to_6.2.33/CycleTestRestart-2.txt b/tests/restarting/to_6.2.33/CycleTestRestart-2.txt new file mode 100644 index 0000000000..7d498f2be1 --- /dev/null +++ b/tests/restarting/to_6.2.33/CycleTestRestart-2.txt @@ -0,0 +1,26 @@ +testTitle=Clogged + runSetup=false + testName=Cycle + transactionsPerSecond=2500.0 + nodeCount=2500 + testDuration=10.0 + expectedRate=0 + + testName=RandomClogging + testDuration=10.0 + + testName=Rollback + meanDelay=10.0 + testDuration=10.0 + + testName=Attrition + machinesToKill=10 + machinesToLeave=3 + reboot=true + testDuration=10.0 + + testName=Attrition + machinesToKill=10 + machinesToLeave=3 + reboot=true + testDuration=10.0 From a8c7a798f2483c22ffd6c8dacbb0946c81237c12 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Wed, 17 Mar 2021 15:34:20 -0600 Subject: [PATCH 013/461] First prototype of actorlineageset --- flow/ActorLineageSet.cpp | 118 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 118 insertions(+) create mode 100644 flow/ActorLineageSet.cpp diff --git a/flow/ActorLineageSet.cpp b/flow/ActorLineageSet.cpp new file mode 100644 index 0000000000..9fb93e9df7 --- /dev/null +++ b/flow/ActorLineageSet.cpp @@ -0,0 +1,118 @@ +/* + * ActorLineageSet.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flow/flow.h" +#include + +class ActorLineageSet { +public: + // The type we use for lookup into the set. Gets assigned during insert + using Index = unsigned; + // For now we use a fixed size capacity + constexpr static Index CAPACITY = 1024; + constexpr static Index npos = std::numeric_limits::max(); + + explicit ActorLineageSet(); + ActorLineageSet(const ActorLineageSet&) = delete; + ActorLineageSet& operator=(const ActorLineageSet&) = delete; + + // Returns the number of elements at the time of calling. Keep in mind that this is a lockfree data structure, so + // the actual size might change anytime after or even during the call. This function only guarantees that the size + // was whatever the method returns at one point between the start and the end of the function call. The safest way + // to handle this is by assuming that this returns an estimate. 
+ unsigned size(); + + Index insert(const Reference& lineage); + void erase(Index idx); + std::vector> copy(); + +private: + static constexpr uintptr_t FREE = 0b1; + static constexpr uintptr_t LOCK = 0b10; + std::atomic _size = 0; + std::vector> _set; + boost::lockfree::queue, boost::lockfree::capacity> freeQueue; + boost::lockfree::queue, boost::lockfree::capacity> + freeList; +}; + +ActorLineageSet::ActorLineageSet() { + // insert the free indexes in reverse order + for (unsigned i = CAPACITY; i > 0; --i) { + freeQueue.push(i - 1); + _set[i] = uintptr_t(1); + } +} + +std::vector> ActorLineageSet::copy() { + std::vector> result; + for (int i = 0; i < CAPACITY; ++i) { + auto ptr = _set[i].load(); + if ((ptr & FREE) != 0) { + ASSERT((ptr & LOCK) == 0); + if (_set[i].compare_exchange_strong(ptr, ptr | LOCK)) { + ActorLineage* entry = reinterpret_cast(ptr); + ptr |= LOCK; + entry->addref(); + // we try to unlock now. If this element was removed while we incremented the refcount, the element will + // end up in the freeList, so we will decrement later. + _set[i].compare_exchange_strong(ptr, ptr ^ LOCK); + result.emplace_back(entry); + } + } + } + // after we're done we need to clean up all objects that contented on a lock. 
This won't be perfect (as some thread + // might not yet added the object to the free list), but whatever we don't get now we'll clean up in the next + // iteration + ActorLineage* toClean; + while (freeList.pop(toClean)) { + toClean->delref(); + } + return result; +} + +ActorLineageSet::Index ActorLineageSet::insert(const Reference& lineage) { + Index res; + if (!freeQueue.pop(res)) { + TraceEvent(SevWarnAlways, "NoCapacityInActorLineageSet"); + return npos; + } + ASSERT(_set[res].load() & FREE); + auto ptr = reinterpret_cast(lineage.getPtr()); + lineage->addref(); + _set[res].store(ptr); + return res; +} + +void ActorLineageSet::erase(Index idx) { + while (true) { + auto ptr = _set[idx].load(); + if (ptr & LOCK) { + _set[idx].store(FREE); + freeList.push(reinterpret_cast(ptr ^ LOCK)); + return; + } else { + if (_set[idx].compare_exchange_strong(ptr, FREE)) { + reinterpret_cast(ptr)->delref(); + return; + } + } + } +} \ No newline at end of file From 9812a49058adf16c2cdd1445f876f372be074109 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Wed, 17 Mar 2021 15:40:19 -0600 Subject: [PATCH 014/461] use consume_all to clean up after copy --- flow/ActorLineageSet.cpp | 5 +---- flow/CMakeLists.txt | 1 + 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/flow/ActorLineageSet.cpp b/flow/ActorLineageSet.cpp index 9fb93e9df7..0957339501 100644 --- a/flow/ActorLineageSet.cpp +++ b/flow/ActorLineageSet.cpp @@ -81,10 +81,7 @@ std::vector> ActorLineageSet::copy() { // after we're done we need to clean up all objects that contented on a lock. 
This won't be perfect (as some thread // might not yet added the object to the free list), but whatever we don't get now we'll clean up in the next // iteration - ActorLineage* toClean; - while (freeList.pop(toClean)) { - toClean->delref(); - } + freeList.consume_all([](auto toClean) { toClean->delRef(); }); return result; } diff --git a/flow/CMakeLists.txt b/flow/CMakeLists.txt index c838e8eff8..5e89fe4d28 100644 --- a/flow/CMakeLists.txt +++ b/flow/CMakeLists.txt @@ -3,6 +3,7 @@ find_package(Threads REQUIRED) set(FLOW_SRCS ActorCollection.actor.cpp ActorCollection.h + ActorLineageSet.cpp Arena.cpp Arena.h AsioReactor.h From f6c7aa6ac77e55266e030109eb77d24b8894952e Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Wed, 17 Mar 2021 15:50:29 -0600 Subject: [PATCH 015/461] fixed typo --- flow/ActorLineageSet.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flow/ActorLineageSet.cpp b/flow/ActorLineageSet.cpp index 0957339501..9a0d34c9bf 100644 --- a/flow/ActorLineageSet.cpp +++ b/flow/ActorLineageSet.cpp @@ -81,7 +81,7 @@ std::vector> ActorLineageSet::copy() { // after we're done we need to clean up all objects that contented on a lock. 
This won't be perfect (as some thread // might not yet added the object to the free list), but whatever we don't get now we'll clean up in the next // iteration - freeList.consume_all([](auto toClean) { toClean->delRef(); }); + freeList.consume_all([](auto toClean) { toClean->delref(); }); return result; } From 4f1b807e1f480f24a0e3cb9622149953c295a4ab Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Wed, 17 Mar 2021 16:01:23 -0600 Subject: [PATCH 016/461] assert object alignment --- flow/ActorLineageSet.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/flow/ActorLineageSet.cpp b/flow/ActorLineageSet.cpp index 9a0d34c9bf..570976379c 100644 --- a/flow/ActorLineageSet.cpp +++ b/flow/ActorLineageSet.cpp @@ -93,6 +93,7 @@ ActorLineageSet::Index ActorLineageSet::insert(const Reference& li } ASSERT(_set[res].load() & FREE); auto ptr = reinterpret_cast(lineage.getPtr()); + ASSERT((ptr % 4) == 0); // this needs to be at least 4-byte aligned lineage->addref(); _set[res].store(ptr); return res; From 650e0de62570338ebff06cedc819a9bb00a0b925 Mon Sep 17 00:00:00 2001 From: Jon Fu Date: Thu, 18 Mar 2021 15:32:17 -0400 Subject: [PATCH 017/461] Remove extra downgrade workloads to restrict downgrade testing to 1 version apart --- tests/CMakeLists.txt | 3 -- .../to_6.2.33/CycleTestRestart-1.txt | 30 ------------------- .../to_6.2.33/CycleTestRestart-2.txt | 26 ---------------- 3 files changed, 59 deletions(-) delete mode 100644 tests/restarting/to_6.2.33/CycleTestRestart-1.txt delete mode 100644 tests/restarting/to_6.2.33/CycleTestRestart-2.txt diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 16f0eb2170..132616b1bb 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -204,9 +204,6 @@ if(WITH_PYTHON) add_fdb_test( TEST_FILES restarting/from_7.0.0/UpgradeAndBackupRestore-1.toml restarting/from_7.0.0/UpgradeAndBackupRestore-2.toml) - add_fdb_test( - TEST_FILES restarting/to_6.2.33/CycleTestRestart-1.txt - restarting/to_6.2.33/CycleTestRestart-2.txt IGNORE) 
add_fdb_test( TEST_FILES restarting/to_6.3.10/CycleTestRestart-1.txt restarting/to_6.3.10/CycleTestRestart-2.txt) diff --git a/tests/restarting/to_6.2.33/CycleTestRestart-1.txt b/tests/restarting/to_6.2.33/CycleTestRestart-1.txt deleted file mode 100644 index 647c2f3fe3..0000000000 --- a/tests/restarting/to_6.2.33/CycleTestRestart-1.txt +++ /dev/null @@ -1,30 +0,0 @@ -testTitle=Clogged - clearAfterTest=false - testName=Cycle - transactionsPerSecond=500.0 - nodeCount=2500 - testDuration=10.0 - expectedRate=0 - - testName=RandomClogging - testDuration=10.0 - - testName=Rollback - meanDelay=10.0 - testDuration=10.0 - - testName=Attrition - machinesToKill=10 - machinesToLeave=3 - reboot=true - testDuration=10.0 - - testName=Attrition - machinesToKill=10 - machinesToLeave=3 - reboot=true - testDuration=10.0 - - testName=SaveAndKill - restartInfoLocation=simfdb/restartInfo.ini - testDuration=10.0 diff --git a/tests/restarting/to_6.2.33/CycleTestRestart-2.txt b/tests/restarting/to_6.2.33/CycleTestRestart-2.txt deleted file mode 100644 index 7d498f2be1..0000000000 --- a/tests/restarting/to_6.2.33/CycleTestRestart-2.txt +++ /dev/null @@ -1,26 +0,0 @@ -testTitle=Clogged - runSetup=false - testName=Cycle - transactionsPerSecond=2500.0 - nodeCount=2500 - testDuration=10.0 - expectedRate=0 - - testName=RandomClogging - testDuration=10.0 - - testName=Rollback - meanDelay=10.0 - testDuration=10.0 - - testName=Attrition - machinesToKill=10 - machinesToLeave=3 - reboot=true - testDuration=10.0 - - testName=Attrition - machinesToKill=10 - machinesToLeave=3 - reboot=true - testDuration=10.0 From 5c1b674815b1765dbc08eed4d98875163dee5708 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Fri, 19 Mar 2021 10:31:58 -0600 Subject: [PATCH 018/461] implemented test --- flow/CMakeLists.txt | 2 +- flow/WriteOnlySet.actor.cpp | 159 +++++++++++++++++++ flow/{ActorLineageSet.cpp => WriteOnlySet.h} | 75 ++++----- 3 files changed, 187 insertions(+), 49 deletions(-) create mode 100644 
flow/WriteOnlySet.actor.cpp rename flow/{ActorLineageSet.cpp => WriteOnlySet.h} (60%) diff --git a/flow/CMakeLists.txt b/flow/CMakeLists.txt index 5e89fe4d28..4c28aee437 100644 --- a/flow/CMakeLists.txt +++ b/flow/CMakeLists.txt @@ -3,7 +3,6 @@ find_package(Threads REQUIRED) set(FLOW_SRCS ActorCollection.actor.cpp ActorCollection.h - ActorLineageSet.cpp Arena.cpp Arena.h AsioReactor.h @@ -70,6 +69,7 @@ set(FLOW_SRCS TreeBenchmark.h UnitTest.cpp UnitTest.h + WriteOnlySet.actor.cpp XmlTraceLogFormatter.cpp XmlTraceLogFormatter.h actorcompiler.h diff --git a/flow/WriteOnlySet.actor.cpp b/flow/WriteOnlySet.actor.cpp new file mode 100644 index 0000000000..d0f7c514ad --- /dev/null +++ b/flow/WriteOnlySet.actor.cpp @@ -0,0 +1,159 @@ +/* + * WriteOnlySet.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2021 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "flow/DeterministicRandom.h" +#include "flow/WriteOnlySet.h" +#include "flow/flow.h" +#include "flow/UnitTest.h" + +#include +#include +#include "flow/actorcompiler.h" // has to be last include + +template +auto WriteOnlySet::insert(const Reference& lineage) -> Index { + Index res; + if (!freeQueue.pop(res)) { + TraceEvent(SevWarnAlways, "NoCapacityInWriteOnlySet"); + return npos; + } + ASSERT(_set[res].load() & FREE); + auto ptr = reinterpret_cast(lineage.getPtr()); + ASSERT((ptr % 4) == 0); // this needs to be at least 4-byte aligned + ASSERT((ptr & FREE) == 0 && (ptr & LOCK) == 0); + lineage->addref(); + _set[res].store(ptr); + return res; +} + +template +void WriteOnlySet::erase(Index idx) { + while (true) { + auto ptr = _set[idx].load(); + if (ptr & LOCK) { + _set[idx].store(FREE); + freeList.push(reinterpret_cast(ptr ^ LOCK)); + return; + } else { + if (_set[idx].compare_exchange_strong(ptr, FREE)) { + reinterpret_cast(ptr)->delref(); + return; + } + } + } +} + +// Explicit instantiation +template class WriteOnlySet; + +// testing code +namespace { + +std::atomic instanceCounter = 0; +constexpr double iteration_frequency = 10.0; + +struct TestObject { + mutable std::atomic _refCount = 1; + TestObject() { instanceCounter.fetch_add(1); } + void delref() const { + if (--_refCount == 0) { + delete this; + --instanceCounter; + } + } + void addref() const { ++_refCount; } +}; + +using TestSet = WriteOnlySet; +using Clock = std::chrono::steady_clock; + +ACTOR Future threadjoiner(std::shared_ptr> threads, std::shared_ptr set) { + loop { + wait(delay(0.1)); + for (unsigned i = 0;;) { + if (threads->size() == i) { + break; + } + auto& t = (*threads)[i]; + if (t.joinable()) { + t.join(); + if (i + 1 < threads->size()) { + std::swap(*threads->rbegin(), (*threads)[i]); + } + threads->pop_back(); + } else { + ++i; + } + } + if (threads->empty()) { + set->copy(); + ASSERT(instanceCounter.load() == 0); + return Void(); + } + } +} + +void 
testCopier(std::shared_ptr set, std::chrono::seconds runFor) { + auto start = Clock::now(); + while (true) { + if (Clock::now() - start > runFor) { + return; + } + auto copy = set->copy(); + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + } +} + +void writer(std::shared_ptr set, std::chrono::seconds runFor) { + auto start = Clock::now(); + std::random_device rDev; + DeterministicRandom rnd(rDev()); + while (true) { + if (Clock::now() - start > runFor) { + return; + } + std::vector positions; + for (int i = 0; i < rnd.randomInt(1, 101); ++i) { + positions.push_back(set->insert(Reference(new TestObject()))); + } + rnd.randomShuffle(positions); + for (auto p : positions) { + set->erase(p); + } + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + } +} + +TEST_CASE("/flow/WriteOnlySet") { + if (g_network->isSimulated()) { + // This test is not deterministic, so we shouldn't run it in simulation + return Void(); + } + auto set = std::make_shared(); + auto threads = std::make_shared>(); + std::chrono::seconds runFor(10); + for (int i = 0; i < 5; ++i) { + threads->emplace_back([set, runFor]() { writer(set, runFor); }); + } + threads->emplace_back([set, runFor]() { testCopier(set, runFor); }); + wait(threadjoiner(threads, set)); + return Void(); +} +} // namespace \ No newline at end of file diff --git a/flow/ActorLineageSet.cpp b/flow/WriteOnlySet.h similarity index 60% rename from flow/ActorLineageSet.cpp rename to flow/WriteOnlySet.h index 570976379c..a319ad22f0 100644 --- a/flow/ActorLineageSet.cpp +++ b/flow/WriteOnlySet.h @@ -1,9 +1,9 @@ /* - * ActorLineageSet.cpp + * WriteOnlySet.cpp * * This source file is part of the FoundationDB open source project * - * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors + * Copyright 2013-2021 Apple Inc. and the FoundationDB project authors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -18,20 +18,23 @@ * limitations under the License. */ -#include "flow/flow.h" +#pragma once +#include "flow/Error.h" +#include "flow/FastRef.h" +#include "flow/Trace.h" #include -class ActorLineageSet { +template +class WriteOnlySet { public: // The type we use for lookup into the set. Gets assigned during insert - using Index = unsigned; + using Index = IndexType; // For now we use a fixed size capacity - constexpr static Index CAPACITY = 1024; constexpr static Index npos = std::numeric_limits::max(); - explicit ActorLineageSet(); - ActorLineageSet(const ActorLineageSet&) = delete; - ActorLineageSet& operator=(const ActorLineageSet&) = delete; + explicit WriteOnlySet(); + WriteOnlySet(const WriteOnlySet&) = delete; + WriteOnlySet& operator=(const WriteOnlySet&) = delete; // Returns the number of elements at the time of calling. Keep in mind that this is a lockfree data structure, so // the actual size might change anytime after or even during the call. This function only guarantees that the size @@ -39,36 +42,39 @@ public: // to handle this is by assuming that this returns an estimate. 
unsigned size(); - Index insert(const Reference& lineage); + Index insert(const Reference& lineage); void erase(Index idx); - std::vector> copy(); + std::vector> copy(); private: static constexpr uintptr_t FREE = 0b1; static constexpr uintptr_t LOCK = 0b10; - std::atomic _size = 0; + std::atomic _size = 0; std::vector> _set; + static_assert(std::atomic::is_always_lock_free, "Index type can't be used as a lock-free type"); + static_assert(std::atomic::is_always_lock_free, "uintptr_t can't be used as a lock-free type"); boost::lockfree::queue, boost::lockfree::capacity> freeQueue; - boost::lockfree::queue, boost::lockfree::capacity> - freeList; + boost::lockfree::queue, boost::lockfree::capacity> freeList; }; -ActorLineageSet::ActorLineageSet() { +template +WriteOnlySet::WriteOnlySet() : _set(CAPACITY) { // insert the free indexes in reverse order for (unsigned i = CAPACITY; i > 0; --i) { freeQueue.push(i - 1); - _set[i] = uintptr_t(1); + _set[i] = uintptr_t(FREE); } } -std::vector> ActorLineageSet::copy() { - std::vector> result; +template +std::vector> WriteOnlySet::copy() { + std::vector> result; for (int i = 0; i < CAPACITY; ++i) { auto ptr = _set[i].load(); if ((ptr & FREE) != 0) { ASSERT((ptr & LOCK) == 0); if (_set[i].compare_exchange_strong(ptr, ptr | LOCK)) { - ActorLineage* entry = reinterpret_cast(ptr); + T* entry = reinterpret_cast(ptr); ptr |= LOCK; entry->addref(); // we try to unlock now. 
If this element was removed while we incremented the refcount, the element will @@ -85,32 +91,5 @@ std::vector> ActorLineageSet::copy() { return result; } -ActorLineageSet::Index ActorLineageSet::insert(const Reference& lineage) { - Index res; - if (!freeQueue.pop(res)) { - TraceEvent(SevWarnAlways, "NoCapacityInActorLineageSet"); - return npos; - } - ASSERT(_set[res].load() & FREE); - auto ptr = reinterpret_cast(lineage.getPtr()); - ASSERT((ptr % 4) == 0); // this needs to be at least 4-byte aligned - lineage->addref(); - _set[res].store(ptr); - return res; -} - -void ActorLineageSet::erase(Index idx) { - while (true) { - auto ptr = _set[idx].load(); - if (ptr & LOCK) { - _set[idx].store(FREE); - freeList.push(reinterpret_cast(ptr ^ LOCK)); - return; - } else { - if (_set[idx].compare_exchange_strong(ptr, FREE)) { - reinterpret_cast(ptr)->delref(); - return; - } - } - } -} \ No newline at end of file +class ActorLineage; +extern template class WriteOnlySet; From 459afeed4cd9d6df4892e085f94d369af59f1efc Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Fri, 19 Mar 2021 11:25:55 -0600 Subject: [PATCH 019/461] disable jemalloc on macOS --- cmake/Jemalloc.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/Jemalloc.cmake b/cmake/Jemalloc.cmake index 6dff173b93..e89ef3ce82 100644 --- a/cmake/Jemalloc.cmake +++ b/cmake/Jemalloc.cmake @@ -3,7 +3,7 @@ add_library(jemalloc INTERFACE) set(USE_JEMALLOC ON) # We don't want to use jemalloc on Windows # Nor on FreeBSD, where jemalloc is the default system allocator -if(USE_SANITIZER OR WIN32 OR (CMAKE_SYSTEM_NAME STREQUAL "FreeBSD")) +if(USE_SANITIZER OR WIN32 OR (CMAKE_SYSTEM_NAME STREQUAL "FreeBSD") OR APPLE) set(USE_JEMALLOC OFF) return() endif() From 995ae34b1e637f6f776fc889e00474eb1ca1a322 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Fri, 19 Mar 2021 17:10:42 -0600 Subject: [PATCH 020/461] Bugfxies & hack to allow new unit test to run --- fdbserver/fdbserver.actor.cpp | 4 ++ 
flow/WriteOnlySet.actor.cpp | 89 ++++++++++++++++++++++++++++++----- flow/WriteOnlySet.h | 44 +++-------------- 3 files changed, 89 insertions(+), 48 deletions(-) diff --git a/fdbserver/fdbserver.actor.cpp b/fdbserver/fdbserver.actor.cpp index ff28269e4f..a285c0b958 100644 --- a/fdbserver/fdbserver.actor.cpp +++ b/fdbserver/fdbserver.actor.cpp @@ -66,6 +66,7 @@ #include "flow/SystemMonitor.h" #include "flow/TLSConfig.actor.h" #include "flow/Tracing.h" +#include "flow/WriteOnlySet.h" #if defined(__linux__) || defined(__FreeBSD__) #include @@ -1572,6 +1573,9 @@ private: } // namespace int main(int argc, char* argv[]) { + // TODO: Remove later, this is just to force the statics to be initialized + // otherwise the unit test won't run + ActorLineageSet _; try { platformInit(); diff --git a/flow/WriteOnlySet.actor.cpp b/flow/WriteOnlySet.actor.cpp index d0f7c514ad..32023f5e24 100644 --- a/flow/WriteOnlySet.actor.cpp +++ b/flow/WriteOnlySet.actor.cpp @@ -34,32 +34,75 @@ auto WriteOnlySet::insert(const Reference& lineage) - TraceEvent(SevWarnAlways, "NoCapacityInWriteOnlySet"); return npos; } - ASSERT(_set[res].load() & FREE); + ASSERT(_set[res].load() == 0); auto ptr = reinterpret_cast(lineage.getPtr()); - ASSERT((ptr % 4) == 0); // this needs to be at least 4-byte aligned - ASSERT((ptr & FREE) == 0 && (ptr & LOCK) == 0); + ASSERT((ptr % 2) == 0); // this needs to be at least 2-byte aligned + ASSERT(ptr != 0); lineage->addref(); _set[res].store(ptr); return res; } template -void WriteOnlySet::erase(Index idx) { +bool WriteOnlySet::eraseImpl(Index idx) { while (true) { auto ptr = _set[idx].load(); if (ptr & LOCK) { - _set[idx].store(FREE); + _set[idx].store(0); freeList.push(reinterpret_cast(ptr ^ LOCK)); - return; + return false; } else { - if (_set[idx].compare_exchange_strong(ptr, FREE)) { + if (_set[idx].compare_exchange_strong(ptr, 0)) { reinterpret_cast(ptr)->delref(); - return; + return true; } } } } +template +bool WriteOnlySet::erase(Index idx) { + auto res = 
eraseImpl(idx); + ASSERT(freeQueue.push(idx)); + return res; +} + +template +WriteOnlySet::WriteOnlySet() : _set(CAPACITY) { + // insert the free indexes in reverse order + for (unsigned i = CAPACITY; i > 0; --i) { + freeQueue.push(i - 1); + _set[i] = uintptr_t(0); + } +} + +template +std::vector> WriteOnlySet::copy() { + std::vector> result; + for (int i = 0; i < CAPACITY; ++i) { + auto ptr = _set[i].load(); + if (ptr) { + ASSERT((ptr & LOCK) == 0); // if we lock something we need to immediately unlock after we're done copying + // We attempt lock so this won't get deleted. We will try this only once, if the other thread removed the + // object from the set between the previews lines and now, we just won't make it part of the result. + if (_set[i].compare_exchange_strong(ptr, ptr | LOCK)) { + T* entry = reinterpret_cast(ptr); + ptr |= LOCK; + entry->addref(); + // we try to unlock now. If this element was removed while we incremented the refcount, the element will + // end up in the freeList, so we will decrement later. + _set[i].compare_exchange_strong(ptr, ptr ^ LOCK); + result.emplace_back(entry); + } + } + } + // after we're done we need to clean up all objects that contented on a lock. 
This won't be perfect (as some thread + // might not yet added the object to the free list), but whatever we don't get now we'll clean up in the next + // iteration + freeList.consume_all([](auto toClean) { toClean->delref(); }); + return result; +} + // Explicit instantiation template class WriteOnlySet; @@ -67,7 +110,10 @@ template class WriteOnlySet; namespace { std::atomic instanceCounter = 0; -constexpr double iteration_frequency = 10.0; +std::atomic numInserts = 0; +std::atomic numErase = 0; +std::atomic numLockedErase = 0; +std::atomic numCopied = 0; struct TestObject { mutable std::atomic _refCount = 1; @@ -117,6 +163,7 @@ void testCopier(std::shared_ptr set, std::chrono::seconds runFor) { return; } auto copy = set->copy(); + numCopied.fetch_add(copy.size()); std::this_thread::sleep_for(std::chrono::milliseconds(10)); } } @@ -126,17 +173,32 @@ void writer(std::shared_ptr set, std::chrono::seconds runFor) { std::random_device rDev; DeterministicRandom rnd(rDev()); while (true) { + unsigned inserts = 0, erases = 0; if (Clock::now() - start > runFor) { return; } std::vector positions; for (int i = 0; i < rnd.randomInt(1, 101); ++i) { - positions.push_back(set->insert(Reference(new TestObject()))); + Reference o(new TestObject()); + auto pos = set->insert(o); + if (pos == TestSet::npos) { + // could not insert -- ignore + break; + } + ++inserts; + ASSERT(pos < TestSet::capacity); + positions.push_back(pos); } rnd.randomShuffle(positions); for (auto p : positions) { - set->erase(p); + if (!set->erase(p)) { + ++numLockedErase; + } + ++erases; } + numInserts.fetch_add(inserts); + numErase.fetch_add(erases); + ASSERT(inserts == erases); std::this_thread::sleep_for(std::chrono::milliseconds(1)); } } @@ -154,6 +216,11 @@ TEST_CASE("/flow/WriteOnlySet") { } threads->emplace_back([set, runFor]() { testCopier(set, runFor); }); wait(threadjoiner(threads, set)); + TraceEvent("WriteOnlySetTestResult") + .detail("Inserts", numInserts.load()) + .detail("Erases", 
numErase.load()) + .detail("Copies", numCopied.load()) + .detail("LockedErase", numLockedErase.load()); return Void(); } } // namespace \ No newline at end of file diff --git a/flow/WriteOnlySet.h b/flow/WriteOnlySet.h index a319ad22f0..9d80795c68 100644 --- a/flow/WriteOnlySet.h +++ b/flow/WriteOnlySet.h @@ -31,6 +31,7 @@ public: using Index = IndexType; // For now we use a fixed size capacity constexpr static Index npos = std::numeric_limits::max(); + constexpr static IndexType capacity = CAPACITY; explicit WriteOnlySet(); WriteOnlySet(const WriteOnlySet&) = delete; @@ -43,12 +44,13 @@ public: unsigned size(); Index insert(const Reference& lineage); - void erase(Index idx); + bool erase(Index idx); std::vector> copy(); private: - static constexpr uintptr_t FREE = 0b1; - static constexpr uintptr_t LOCK = 0b10; + bool eraseImpl(Index idx); + + static constexpr uintptr_t LOCK = 0b1; std::atomic _size = 0; std::vector> _set; static_assert(std::atomic::is_always_lock_free, "Index type can't be used as a lock-free type"); @@ -57,39 +59,7 @@ private: boost::lockfree::queue, boost::lockfree::capacity> freeList; }; -template -WriteOnlySet::WriteOnlySet() : _set(CAPACITY) { - // insert the free indexes in reverse order - for (unsigned i = CAPACITY; i > 0; --i) { - freeQueue.push(i - 1); - _set[i] = uintptr_t(FREE); - } -} - -template -std::vector> WriteOnlySet::copy() { - std::vector> result; - for (int i = 0; i < CAPACITY; ++i) { - auto ptr = _set[i].load(); - if ((ptr & FREE) != 0) { - ASSERT((ptr & LOCK) == 0); - if (_set[i].compare_exchange_strong(ptr, ptr | LOCK)) { - T* entry = reinterpret_cast(ptr); - ptr |= LOCK; - entry->addref(); - // we try to unlock now. If this element was removed while we incremented the refcount, the element will - // end up in the freeList, so we will decrement later. - _set[i].compare_exchange_strong(ptr, ptr ^ LOCK); - result.emplace_back(entry); - } - } - } - // after we're done we need to clean up all objects that contented on a lock. 
This won't be perfect (as some thread - // might not yet added the object to the free list), but whatever we don't get now we'll clean up in the next - // iteration - freeList.consume_all([](auto toClean) { toClean->delref(); }); - return result; -} - class ActorLineage; extern template class WriteOnlySet; + +using ActorLineageSet = WriteOnlySet; From 99ac47e96c10922ca40e1267467bcfcbb51a51a0 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Fri, 19 Mar 2021 18:08:09 -0600 Subject: [PATCH 021/461] documentation --- flow/WriteOnlySet.actor.cpp | 6 ++++ flow/WriteOnlySet.h | 65 +++++++++++++++++++++++++++++++++---- 2 files changed, 64 insertions(+), 7 deletions(-) diff --git a/flow/WriteOnlySet.actor.cpp b/flow/WriteOnlySet.actor.cpp index 32023f5e24..93d9e99fc7 100644 --- a/flow/WriteOnlySet.actor.cpp +++ b/flow/WriteOnlySet.actor.cpp @@ -109,12 +109,14 @@ template class WriteOnlySet; // testing code namespace { +// Some statistics std::atomic instanceCounter = 0; std::atomic numInserts = 0; std::atomic numErase = 0; std::atomic numLockedErase = 0; std::atomic numCopied = 0; +// A simple object that counts the number of its instances. This is used to detect memory leaks. struct TestObject { mutable std::atomic _refCount = 1; TestObject() { instanceCounter.fetch_add(1); } @@ -130,6 +132,7 @@ struct TestObject { using TestSet = WriteOnlySet; using Clock = std::chrono::steady_clock; +// An actor that can join a set of threads in an async way. ACTOR Future threadjoiner(std::shared_ptr> threads, std::shared_ptr set) { loop { wait(delay(0.1)); @@ -156,6 +159,7 @@ ACTOR Future threadjoiner(std::shared_ptr> thread } } +// occasionally copy the contents of the past set. 
void testCopier(std::shared_ptr set, std::chrono::seconds runFor) { auto start = Clock::now(); while (true) { @@ -168,6 +172,7 @@ void testCopier(std::shared_ptr set, std::chrono::seconds runFor) { } } +// In a loop adds and removes a set of objects to the set void writer(std::shared_ptr set, std::chrono::seconds runFor) { auto start = Clock::now(); std::random_device rDev; @@ -203,6 +208,7 @@ void writer(std::shared_ptr set, std::chrono::seconds runFor) { } } +// This unit test creates 5 writer threads and one copier thread. TEST_CASE("/flow/WriteOnlySet") { if (g_network->isSimulated()) { // This test is not deterministic, so we shouldn't run it in simulation diff --git a/flow/WriteOnlySet.h b/flow/WriteOnlySet.h index 9d80795c68..a2589ec387 100644 --- a/flow/WriteOnlySet.h +++ b/flow/WriteOnlySet.h @@ -24,6 +24,21 @@ #include "flow/Trace.h" #include +/** + * This is a Write-Only set that supports copying the whole content. This data structure is lock-free and allows a user + * to insert and remove objects up to a given capacity (passed by a template). + * + * Template parameters: + * \param T The type to store. + * \param IndexType The type used as an index + * \param CAPACITY The maximum number of object this structure can store (if a user tries to store more, insert will + * fail gracefully) + * \pre T implements `void addref() const` and `void delref() const` + * \pre IndexType must have a copy constructor + * \pre IndexType must have a trivial assignment operator + * \pre IndexType must have a trivial destructor + * \pre IndexType can be used as an index into a std::vector + */ template class WriteOnlySet { public: @@ -37,25 +52,61 @@ public: WriteOnlySet(const WriteOnlySet&) = delete; WriteOnlySet& operator=(const WriteOnlySet&) = delete; - // Returns the number of elements at the time of calling. Keep in mind that this is a lockfree data structure, so - // the actual size might change anytime after or even during the call. 
This function only guarantees that the size - // was whatever the method returns at one point between the start and the end of the function call. The safest way - // to handle this is by assuming that this returns an estimate. - unsigned size(); + /** + * Attempts to insert \p lineage into the set. This method can fail if the set is full (its size is equal to its + * capacity). Calling insert on a full set is safe but the method will return \ref npos if the operation fails. + * + * \param lineage A reference to the object the user wants to insert. + * \ret An index that can later be used to erase the value again or \ref npos if the insert failed. + * \pre lineage.getPtr() % 2 == 0 (the memory for lineage has to be at least 2 byte aligned) + */ + [[nodiscard]] Index insert(const Reference& lineage); - Index insert(const Reference& lineage); + /** + * Erases the object associated with \p idx from the set. + * + * \ret Whether the reference count was decremented. Usually the return value is only interesting for testing and + * benchmarking purposes and will in most cases be ignored. If \ref delref wasn't called, it will be called + * later. Note that at the time the return value is checked, \ref delref might already have been called. + */ bool erase(Index idx); + /** + * Copies all elements that are stored in the set into a vector. This copy operation does NOT provide a snapshot of + * the data structure. The contract is weak: + * - All object that were in the set before copy is called and weren't removed until after copy returned are + * guaranteed to be in the result. + * - Any object that was inserted while copy is running might be in the result. + * - Any object that was erased while copy is running might be in the result. + */ std::vector> copy(); private: + // the implementation of erase -- the wrapper just makes the function a bit more readable. 
bool eraseImpl(Index idx); + // the last bit of a pointer within the set is used like a boolean and true means that the object is locked. Locking + // an object is only relevant for memory management. A locked pointer can still be erased from the set, but the + // erase won't call delref on the object. Instead it will push the pointer into the \ref freeList and copy will call + // delref later. static constexpr uintptr_t LOCK = 0b1; - std::atomic _size = 0; + + // The actual memory std::vector> _set; static_assert(std::atomic::is_always_lock_free, "Index type can't be used as a lock-free type"); static_assert(std::atomic::is_always_lock_free, "uintptr_t can't be used as a lock-free type"); + + // The freeQueue. On creation all indexes (0..capacity-1) are pushed into this queue. On insert one element from + // this queue is consumed and the resulting number is used as an index into the set. On erase the index is given + // back to the freeQueue. boost::lockfree::queue, boost::lockfree::capacity> freeQueue; + + // The freeList is used for memory management. Generally copying a shared pointer can't be done in a lock-free way. + // Instead, when we copy the data structure we first copy the address, then attempt to set the last bit to 1 and + // only if that succeeds we will increment the reference count. Whenever we attempt to remove an object + // in \ref erase we remove the object from the set (using an atomic compare and swap) and only decrement the + // reference count if the last bit is 0. If it's not we'll push the pointer into this free list. + // \ref copy will consume all elements from this freeList each time it runs and decrements the refcount for each + // element. 
boost::lockfree::queue, boost::lockfree::capacity> freeList; }; From 61352b912444c5d3601b8e33de234cc1f61fe32b Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Mon, 22 Mar 2021 11:41:45 -0600 Subject: [PATCH 022/461] use push_back where emplace_back is unnecessary --- flow/WriteOnlySet.actor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flow/WriteOnlySet.actor.cpp b/flow/WriteOnlySet.actor.cpp index 93d9e99fc7..9ab63aa56f 100644 --- a/flow/WriteOnlySet.actor.cpp +++ b/flow/WriteOnlySet.actor.cpp @@ -92,7 +92,7 @@ std::vector> WriteOnlySet::copy() { // we try to unlock now. If this element was removed while we incremented the refcount, the element will // end up in the freeList, so we will decrement later. _set[i].compare_exchange_strong(ptr, ptr ^ LOCK); - result.emplace_back(entry); + result.push_back(entry); } } } From 301daf326939d6378d410420d007322f7c7a3dd3 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Mon, 22 Mar 2021 11:46:16 -0600 Subject: [PATCH 023/461] address review comments --- flow/WriteOnlySet.actor.cpp | 2 +- flow/WriteOnlySet.h | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/flow/WriteOnlySet.actor.cpp b/flow/WriteOnlySet.actor.cpp index 9ab63aa56f..364c53460d 100644 --- a/flow/WriteOnlySet.actor.cpp +++ b/flow/WriteOnlySet.actor.cpp @@ -1,5 +1,5 @@ /* - * WriteOnlySet.cpp + * WriteOnlySet.actor.cpp * * This source file is part of the FoundationDB open source project * diff --git a/flow/WriteOnlySet.h b/flow/WriteOnlySet.h index a2589ec387..c71736f852 100644 --- a/flow/WriteOnlySet.h +++ b/flow/WriteOnlySet.h @@ -1,5 +1,5 @@ /* - * WriteOnlySet.cpp + * WriteOnlySet.h * * This source file is part of the FoundationDB open source project * @@ -50,7 +50,9 @@ public: explicit WriteOnlySet(); WriteOnlySet(const WriteOnlySet&) = delete; + WriteOnlySet(WriteOnlySet&&) = delete; WriteOnlySet& operator=(const WriteOnlySet&) = delete; + WriteOnlySet& operator=(WriteOnlySet&&) = delete; /** * Attempts to 
insert \p lineage into the set. This method can fail if the set is full (its size is equal to its @@ -93,7 +95,7 @@ private: // The actual memory std::vector> _set; static_assert(std::atomic::is_always_lock_free, "Index type can't be used as a lock-free type"); - static_assert(std::atomic::is_always_lock_free, "uintptr_t can't be used as a lock-free type"); + static_assert(std::atomic::is_always_lock_free, "uintptr_t can't be used as a lock-free type"); // The freeQueue. On creation all indexes (0..capacity-1) are pushed into this queue. On insert one element from // this queue is consumed and the resulting number is used as an index into the set. On erase the index is given From 5bd79de88179945a78e7862d90e7de183d3d690c Mon Sep 17 00:00:00 2001 From: Lukas Joswiak Date: Mon, 22 Mar 2021 10:01:28 -0700 Subject: [PATCH 024/461] Fix build --- flow/Profiler.actor.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/flow/Profiler.actor.cpp b/flow/Profiler.actor.cpp index 46b0bcecb4..24bba87739 100644 --- a/flow/Profiler.actor.cpp +++ b/flow/Profiler.actor.cpp @@ -142,6 +142,8 @@ struct Profiler { } void signal_handler() { // async signal safe! + static std::atomic inSigHandler = false; + if (inSigHandler.exchange(true)) { return; } if (profilingEnabled) { double t = timer(); output_buffer->push(*(void**)&t); From 0ec7340a6f72f8d29b43ade50667d2b0e88ebd75 Mon Sep 17 00:00:00 2001 From: Lukas Joswiak Date: Mon, 22 Mar 2021 10:55:52 -0700 Subject: [PATCH 025/461] Create reference --- flow/WriteOnlySet.actor.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/flow/WriteOnlySet.actor.cpp b/flow/WriteOnlySet.actor.cpp index 364c53460d..92eceea7bc 100644 --- a/flow/WriteOnlySet.actor.cpp +++ b/flow/WriteOnlySet.actor.cpp @@ -92,7 +92,7 @@ std::vector> WriteOnlySet::copy() { // we try to unlock now. If this element was removed while we incremented the refcount, the element will // end up in the freeList, so we will decrement later. 
_set[i].compare_exchange_strong(ptr, ptr ^ LOCK); - result.push_back(entry); + result.push_back(Reference(entry)); } } } @@ -229,4 +229,4 @@ TEST_CASE("/flow/WriteOnlySet") { .detail("LockedErase", numLockedErase.load()); return Void(); } -} // namespace \ No newline at end of file +} // namespace From 35f9fe08a277ba3c1e0d74dc6795cb7ca7811194 Mon Sep 17 00:00:00 2001 From: Chaoguang Lin Date: Tue, 23 Mar 2021 14:44:14 -0700 Subject: [PATCH 026/461] Remove unnecessary header in IClientApi.h --- fdbclient/IClientApi.h | 1 - 1 file changed, 1 deletion(-) diff --git a/fdbclient/IClientApi.h b/fdbclient/IClientApi.h index 25f5098b0f..0791f795a4 100644 --- a/fdbclient/IClientApi.h +++ b/fdbclient/IClientApi.h @@ -20,7 +20,6 @@ #ifndef FDBCLIENT_ICLIENTAPI_H #define FDBCLIENT_ICLIENTAPI_H -#include "fdbclient/ManagementAPI.actor.h" #pragma once #include "fdbclient/FDBOptions.g.h" From cb39d1a6ed1ee89f3e02369e56068465818e42b8 Mon Sep 17 00:00:00 2001 From: Chaoguang Lin Date: Wed, 24 Mar 2021 09:33:20 -0700 Subject: [PATCH 027/461] Refactor consistencycheck command using special keys --- fdbcli/CMakeLists.txt | 3 + fdbcli/ConsistencycheckCommand.actor.cpp | 45 +++++++++++++ fdbcli/Util.cpp | 12 ++++ fdbcli/fdbcli.actor.cpp | 82 +++++++++++------------- fdbcli/fdbcli.h | 60 +++++++++++++++++ 5 files changed, 156 insertions(+), 46 deletions(-) create mode 100644 fdbcli/ConsistencycheckCommand.actor.cpp create mode 100644 fdbcli/Util.cpp create mode 100644 fdbcli/fdbcli.h diff --git a/fdbcli/CMakeLists.txt b/fdbcli/CMakeLists.txt index 2b65baf040..b1eb09d491 100644 --- a/fdbcli/CMakeLists.txt +++ b/fdbcli/CMakeLists.txt @@ -1,7 +1,10 @@ set(FDBCLI_SRCS + fdbcli.h fdbcli.actor.cpp + ConsistencycheckCommand.actor.cpp FlowLineNoise.actor.cpp FlowLineNoise.h + Util.cpp linenoise/linenoise.h) if(NOT WIN32) diff --git a/fdbcli/ConsistencycheckCommand.actor.cpp b/fdbcli/ConsistencycheckCommand.actor.cpp new file mode 100644 index 0000000000..349be547f5 --- /dev/null +++ 
b/fdbcli/ConsistencycheckCommand.actor.cpp @@ -0,0 +1,45 @@ +#include "fdbcli/fdbcli.h" + +#include "fdbclient/FDBOptions.g.h" +#include "fdbclient/IClientApi.h" + +#include "flow/Arena.h" +#include "flow/FastRef.h" +#include "flow/ThreadHelper.actor.h" +#include "flow/actorcompiler.h" + +using namespace FDBCLI; + +ACTOR static Future consistencycheckCommandActor(Reference db, std::vector tokens) { + state Reference tr = db->createTransaction(); + tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); + KeyRef k = LiteralStringRef("\xff\xff/management/consistency_check_suspended"); + if (tokens.size() == 1) { + Optional suspended = wait(unsafeThreadFutureToFuture(tr->get(k))); + printf("ConsistencyCheck is %s\n", suspended.present() ? "off" : "on"); + } else if (tokens.size() == 2 && tokencmp(tokens[1], "off")) { + tr->set(k, Value()); + wait(unsafeThreadFutureToFuture(tr->commit())); + } else if (tokens.size() == 2 && tokencmp(tokens[1], "on")) { + tr->clear(k); + wait(unsafeThreadFutureToFuture(tr->commit())); + } else { + printUsage(tokens[0]); + return false; + } + return true; +} + +namespace FDBCLI { + +Future consistencycheckCommand(Reference db, std::vector tokens) { + return consistencycheckCommandActor(db, tokens); +} + +CommandFactory consistencycheckFactory("consistencycheck", CommandHelp( + "consistencycheck [on|off]", + "permits or prevents consistency checking", + "Calling this command with `on' permits consistency check processes to run and `off' will halt their checking. 
" + "Calling this command with no arguments will display if consistency checking is currently allowed.\n")); + +} // namespace FDBCLI \ No newline at end of file diff --git a/fdbcli/Util.cpp b/fdbcli/Util.cpp new file mode 100644 index 0000000000..20d9da2f2c --- /dev/null +++ b/fdbcli/Util.cpp @@ -0,0 +1,12 @@ +#include "flow/Arena.h" + +namespace FDBCLI { + +bool tokencmp(StringRef token, const char* command) { + if (token.size() != strlen(command)) + return false; + + return !memcmp(token.begin(), command, token.size()); +} + +} \ No newline at end of file diff --git a/fdbcli/fdbcli.actor.cpp b/fdbcli/fdbcli.actor.cpp index e608e96086..d88e98455f 100644 --- a/fdbcli/fdbcli.actor.cpp +++ b/fdbcli/fdbcli.actor.cpp @@ -21,6 +21,8 @@ #include "boost/lexical_cast.hpp" #include "fdbclient/NativeAPI.actor.h" #include "fdbclient/FDBTypes.h" +#include "fdbclient/IClientApi.h" +#include "fdbclient/MultiVersionTransaction.h" #include "fdbclient/Status.h" #include "fdbclient/StatusClient.h" #include "fdbclient/DatabaseContext.h" @@ -34,12 +36,14 @@ #include "fdbclient/TagThrottle.h" #include "flow/DeterministicRandom.h" +#include "flow/FastRef.h" #include "flow/Platform.h" #include "flow/TLSConfig.actor.h" #include "flow/SimpleOpt.h" #include "fdbcli/FlowLineNoise.h" +#include "fdbcli/fdbcli.h" #include #include @@ -55,6 +59,12 @@ #include "flow/actorcompiler.h" // This must be the last #include. +/* + * While we could just use the MultiVersionApi instance directly, this #define allows us to swap in any other IClientApi + * instance (e.g. 
from ThreadSafeApi) + */ +#define API ((IClientApi*)MultiVersionApi::api) + extern const char* getSourceVersion(); std::vector validOptions; @@ -319,12 +329,12 @@ static std::string formatStringRef(StringRef item, bool fullEscaping = false) { return ret; } -static bool tokencmp(StringRef token, const char* command) { - if (token.size() != strlen(command)) - return false; +// static bool tokencmp(StringRef token, const char* command) { +// if (token.size() != strlen(command)) +// return false; - return !memcmp(token.begin(), command, token.size()); -} +// return !memcmp(token.begin(), command, token.size()); +// } static std::vector> parseLine(std::string& line, bool& err, bool& partial) { err = false; @@ -452,20 +462,13 @@ static void printProgramUsage(const char* name) { " -h, --help Display this help and exit.\n"); } -struct CommandHelp { - std::string usage; - std::string short_desc; - std::string long_desc; - CommandHelp() {} - CommandHelp(const char* u, const char* s, const char* l) : usage(u), short_desc(s), long_desc(l) {} -}; - -std::map helpMap; -std::set hiddenCommands; - #define ESCAPINGK "\n\nFor information on escaping keys, type `help escaping'." #define ESCAPINGKV "\n\nFor information on escaping keys and values, type `help escaping'." +using namespace FDBCLI; +std::map& helpMap = FDBCLI::CommandFactory::commands(); +std::set& hiddenCommands = FDBCLI::CommandFactory::hiddenCommands(); + void initHelp() { helpMap["begin"] = CommandHelp("begin", @@ -649,11 +652,6 @@ void initHelp() { "SECONDS have elapsed, or after a storage server with a different ZONEID fails. Only one ZONEID can be marked " "for maintenance. Calling this command with no arguments will display any ongoing maintenance. 
Calling this " "command with `off' will disable maintenance.\n"); - helpMap["consistencycheck"] = CommandHelp( - "consistencycheck [on|off]", - "permits or prevents consistency checking", - "Calling this command with `on' permits consistency check processes to run and `off' will halt their checking. " - "Calling this command with no arguments will display if consistency checking is currently allowed.\n"); helpMap["throttle"] = CommandHelp("throttle [ARGS]", "view and control throttled tags", @@ -719,7 +717,7 @@ void printHelp(StringRef command) { printf("I don't know anything about `%s'\n", formatStringRef(command).c_str()); } -void printUsage(StringRef command) { +void FDBCLI::printUsage(StringRef command) { auto i = helpMap.find(command.toString()); if (i != helpMap.end()) printf("Usage: %s\n", i->second.usage.c_str()); @@ -3140,6 +3138,8 @@ ACTOR Future cli(CLIOptions opt, LineNoise* plinenoise) { state Database db; state Reference tr; + // refactoring + state Reference db2; state bool writeMode = false; @@ -3177,6 +3177,14 @@ ACTOR Future cli(CLIOptions opt, LineNoise* plinenoise) { return 1; } + try { + db2 = API->createDatabase(opt.clusterFile.c_str()); + } catch (Error& e) { + fprintf(stderr, "(CAPI)ERROR: %s (%d)\n", e.what(), e.code()); + printf("(CAPI): Unable to connect to cluster from `%s'\n", ccf->getFilename().c_str()); + return 1; + } + if (opt.trace) { TraceEvent("CLIProgramStart") .setMaxEventLength(12000) @@ -3795,29 +3803,8 @@ ACTOR Future cli(CLIOptions opt, LineNoise* plinenoise) { } if (tokencmp(tokens[0], "consistencycheck")) { - getTransaction(db, tr, options, intrans); - tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); - tr->setOption(FDBTransactionOptions::LOCK_AWARE); - if (tokens.size() == 1) { - state Future>> ccSuspendSettingFuture = - tr->get(fdbShouldConsistencyCheckBeSuspended); - wait(makeInterruptable(success(ccSuspendSettingFuture))); - bool 
ccSuspendSetting = - ccSuspendSettingFuture.get().present() - ? BinaryReader::fromStringRef(ccSuspendSettingFuture.get().get(), Unversioned()) - : false; - printf("ConsistencyCheck is %s\n", ccSuspendSetting ? "off" : "on"); - } else if (tokens.size() == 2 && tokencmp(tokens[1], "off")) { - tr->set(fdbShouldConsistencyCheckBeSuspended, BinaryWriter::toValue(true, Unversioned())); - wait(commitTransaction(tr)); - } else if (tokens.size() == 2 && tokencmp(tokens[1], "on")) { - tr->set(fdbShouldConsistencyCheckBeSuspended, BinaryWriter::toValue(false, Unversioned())); - wait(commitTransaction(tr)); - } else { - printUsage(tokens[0]); - is_error = true; - } + bool _result = wait(consistencycheckCommand(db2, tokens)); + is_error = _result; continue; } @@ -4909,7 +4896,10 @@ int main(int argc, char** argv) { } try { - setupNetwork(); + // setupNetwork(); + // refactoring fdbcli + API->selectApiVersion(700); + API->setupNetwork(); Future cliFuture = runCli(opt); Future timeoutFuture = opt.exit_timeout ? timeExit(opt.exit_timeout) : Never(); auto f = stopNetworkAfter(success(cliFuture) || timeoutFuture); diff --git a/fdbcli/fdbcli.h b/fdbcli/fdbcli.h new file mode 100644 index 0000000000..d93616e657 --- /dev/null +++ b/fdbcli/fdbcli.h @@ -0,0 +1,60 @@ +/* + * fdbcli.actor.h + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2021 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef FDBCLI_H +#define FDBCLI_H +#pragma once + +#include "fdbclient/IClientApi.h" +#include "flow/Arena.h" + +namespace FDBCLI { + +struct CommandHelp { + std::string usage; + std::string short_desc; + std::string long_desc; + CommandHelp() {} + CommandHelp(const char* u, const char* s, const char* l) : usage(u), short_desc(s), long_desc(l) {} +}; + +struct CommandFactory { + CommandFactory(const char* name, CommandHelp help) { commands()[name] = help; } + CommandFactory(const char* name) { hiddenCommands().insert(name); } + static std::map& commands() { + static std::map helpMap; + return helpMap; + } + static std::set& hiddenCommands() { + static std::set commands; + return commands; + } +}; + +// help functions +bool tokencmp(StringRef token, const char* command); +void printUsage(StringRef command); + +// consistency command +Future consistencycheckCommand(Reference db, std::vector tokens); + +} // namespace FDBCLI + +#endif \ No newline at end of file From b246e673bceab43b28cc4a855584333eb3404146 Mon Sep 17 00:00:00 2001 From: Jon Fu Date: Wed, 24 Mar 2021 15:34:19 -0400 Subject: [PATCH 028/461] Added comment to seedShardServers (taken from existing desc in .h file) --- fdbserver/MoveKeys.actor.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fdbserver/MoveKeys.actor.cpp b/fdbserver/MoveKeys.actor.cpp index 83f7170e95..0702b8d097 100644 --- a/fdbserver/MoveKeys.actor.cpp +++ b/fdbserver/MoveKeys.actor.cpp @@ -1212,6 +1212,8 @@ ACTOR Future moveKeys(Database cx, return Void(); } +// Called by the master server to write the very first transaction to the database +// establishing a set of shard servers and all invariants of the systemKeys. 
void seedShardServers(Arena& arena, CommitTransactionRef& tr, vector servers) { std::map, Tag> dcId_locality; std::map server_tag; From 2dfd420882537d7fa7d477c08b699f1a5e961a1c Mon Sep 17 00:00:00 2001 From: Lukas Joswiak Date: Wed, 24 Mar 2021 14:52:42 -0700 Subject: [PATCH 029/461] Add sampling profiler thread --- fdbrpc/AsyncFileKAIO.actor.h | 6 +++++- fdbrpc/IAsyncFile.h | 4 ++++ fdbrpc/Net2FileSystem.cpp | 4 ++++ fdbrpc/Net2FileSystem.h | 3 +++ fdbrpc/sim2.actor.cpp | 4 ++++ fdbrpc/simulator.h | 4 ++++ fdbserver/fdbserver.actor.cpp | 1 + flow/Platform.actor.cpp | 27 +++++++++++++++++++++++++++ flow/Platform.h | 2 ++ 9 files changed, 54 insertions(+), 1 deletion(-) diff --git a/fdbrpc/AsyncFileKAIO.actor.h b/fdbrpc/AsyncFileKAIO.actor.h index 5e6592e6ba..dbdb040d00 100644 --- a/fdbrpc/AsyncFileKAIO.actor.h +++ b/fdbrpc/AsyncFileKAIO.actor.h @@ -242,7 +242,11 @@ public: // result = map(result, [=](int r) mutable { KAIOLogBlockEvent(io, OpLogEntry::READY, r); return r; }); #endif - return success(result); + auto& actorLineageSet = IAsyncFileSystem::filesystem()->getActorLineageSet(); + auto index = actorLineageSet.insert(currentLineage); + Future res = success(result); + actorLineageSet.erase(index); + return res; } // TODO(alexmiller): Remove when we upgrade the dev docker image to >14.10 #ifndef FALLOC_FL_ZERO_RANGE diff --git a/fdbrpc/IAsyncFile.h b/fdbrpc/IAsyncFile.h index ed703514c6..ad48db5f07 100644 --- a/fdbrpc/IAsyncFile.h +++ b/fdbrpc/IAsyncFile.h @@ -25,6 +25,7 @@ #include #include "flow/flow.h" +#include "flow/WriteOnlySet.h" #include "fdbrpc/IRateControl.h" // All outstanding operations must be cancelled before the destructor of IAsyncFile is called. @@ -118,6 +119,9 @@ public: // Returns the time of the last modification of the file. virtual Future lastWriteTime(const std::string& filename) = 0; + // Returns the shared memory data structure used to store actor lineages. 
+ virtual ActorLineageSet& getActorLineageSet() = 0; + static IAsyncFileSystem* filesystem() { return filesystem(g_network); } static runCycleFuncPtr runCycleFunc() { return reinterpret_cast( diff --git a/fdbrpc/Net2FileSystem.cpp b/fdbrpc/Net2FileSystem.cpp index 71a7d784a1..8e895c08dc 100644 --- a/fdbrpc/Net2FileSystem.cpp +++ b/fdbrpc/Net2FileSystem.cpp @@ -89,6 +89,10 @@ Future Net2FileSystem::lastWriteTime(const std::string& filename) { return Net2AsyncFile::lastWriteTime(filename); } +ActorLineageSet& Net2FileSystem::getActorLineageSet() { + return actorLineageSet; +} + void Net2FileSystem::newFileSystem(double ioTimeout, const std::string& fileSystemPath) { g_network->setGlobal(INetwork::enFileSystem, (flowGlobalType) new Net2FileSystem(ioTimeout, fileSystemPath)); } diff --git a/fdbrpc/Net2FileSystem.h b/fdbrpc/Net2FileSystem.h index 702b87828f..0c2229b5ca 100644 --- a/fdbrpc/Net2FileSystem.h +++ b/fdbrpc/Net2FileSystem.h @@ -39,6 +39,8 @@ public: Future renameFile(std::string const& from, std::string const& to) override; + ActorLineageSet& getActorLineageSet() override; + // void init(); static void stop(); @@ -52,6 +54,7 @@ public: dev_t fileSystemDeviceId; bool checkFileSystem; #endif + ActorLineageSet actorLineageSet; }; #endif diff --git a/fdbrpc/sim2.actor.cpp b/fdbrpc/sim2.actor.cpp index 6101ca8512..e9219f3ff3 100644 --- a/fdbrpc/sim2.actor.cpp +++ b/fdbrpc/sim2.actor.cpp @@ -2494,6 +2494,10 @@ Future Sim2FileSystem::lastWriteTime(const std::string& filename) { return fileWrites[filename]; } +ActorLineageSet& Sim2FileSystem::getActorLineageSet() { + return actorLineageSet; +} + void Sim2FileSystem::newFileSystem() { g_network->setGlobal(INetwork::enFileSystem, (flowGlobalType) new Sim2FileSystem()); } diff --git a/fdbrpc/simulator.h b/fdbrpc/simulator.h index cde0eb0dda..08b4264e81 100644 --- a/fdbrpc/simulator.h +++ b/fdbrpc/simulator.h @@ -471,6 +471,8 @@ public: Future lastWriteTime(const std::string& filename) override; + ActorLineageSet& 
getActorLineageSet() override; + Future renameFile(std::string const& from, std::string const& to) override; Sim2FileSystem() {} @@ -478,6 +480,8 @@ public: ~Sim2FileSystem() override {} static void newFileSystem(); + + ActorLineageSet actorLineageSet; }; #endif diff --git a/fdbserver/fdbserver.actor.cpp b/fdbserver/fdbserver.actor.cpp index a285c0b958..fbcd7fd9ee 100644 --- a/fdbserver/fdbserver.actor.cpp +++ b/fdbserver/fdbserver.actor.cpp @@ -1948,6 +1948,7 @@ int main(int argc, char* argv[]) { ASSERT(opts.connectionFile); setupRunLoopProfiler(); + setupSamplingProfiler(); auto dataFolder = opts.dataFolder; if (!dataFolder.size()) diff --git a/flow/Platform.actor.cpp b/flow/Platform.actor.cpp index 42d8decccc..756fb6a7e3 100644 --- a/flow/Platform.actor.cpp +++ b/flow/Platform.actor.cpp @@ -48,6 +48,8 @@ #include "flow/UnitTest.h" #include "flow/FaultInjection.h" +#include "fdbrpc/IAsyncFile.h" + #ifdef _WIN32 #include #include @@ -3673,6 +3675,31 @@ void setupRunLoopProfiler() { #endif } +void* sampleThread(void* arg) { + while (true) { + threadSleep(1.0); // TODO: Read sample rate from global config + + // TODO: Copy actor lineage of currently running actor + + auto diskAlps = IAsyncFileSystem::filesystem()->getActorLineageSet().copy(); + printf("Disk ALPs: %d\n", diskAlps.size()); + + // TODO: Call collect on all actor lineages + for (auto actorLineage : diskAlps) { + } + + // TODO: Serialize collected actor linage properties + } + + return nullptr; +} + +void setupSamplingProfiler() { + // TODO: Add knob + TraceEvent("StartingSamplingProfilerThread"); + startThread(&sampleThread, nullptr); +} + // UnitTest for getMemoryInfo #ifdef __linux__ TEST_CASE("/flow/Platform/getMemoryInfo") { diff --git a/flow/Platform.h b/flow/Platform.h index 74c9395c53..edf9ff3997 100644 --- a/flow/Platform.h +++ b/flow/Platform.h @@ -741,6 +741,8 @@ void registerCrashHandler(); void setupRunLoopProfiler(); EXTERNC void setProfilingEnabled(int enabled); +void 
setupSamplingProfiler(); + // Use _exit() or criticalError(), not exit() #define exit static_assert(false, "Calls to exit() are forbidden by policy"); From 36f4c17ef143cd3c82b7038f001d256867e2a7fa Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Wed, 24 Mar 2021 15:04:45 -0700 Subject: [PATCH 030/461] Reduce the number of actor calls in load balancing to improve performance. --- fdbrpc/LoadBalance.actor.h | 321 +++++++++++++++++++++---------------- 1 file changed, 184 insertions(+), 137 deletions(-) diff --git a/fdbrpc/LoadBalance.actor.h b/fdbrpc/LoadBalance.actor.h index 9b47912993..78f73352ba 100644 --- a/fdbrpc/LoadBalance.actor.h +++ b/fdbrpc/LoadBalance.actor.h @@ -75,109 +75,169 @@ struct LoadBalancedReply { Optional getLoadBalancedReply(const LoadBalancedReply* reply); Optional getLoadBalancedReply(const void*); -// Returns true if we got a value for our request -// Throws an error if the request returned an error that should bubble out -// Returns false if we got an error that should result in reissuing the request -template -bool checkAndProcessResult(ErrorOr result, Reference holder, bool atMostOnce, bool triedAllOptions) { - Optional loadBalancedReply; - if (!result.isError()) { - loadBalancedReply = getLoadBalancedReply(&result.get()); +// Stores state for a request made by the load balancer +template +struct RequestData : NonCopyable { + Future> response; + Reference modelHolder; + Future backoffDelay; + RequestStream const* stream = nullptr; + bool triedAllOptions = false; + + bool requestStarted = false; // true once the request has been sent to an alternative + bool requestProcessed = false; // true once a response has been received and handled by checkAndProcessResult + + // Whether or not the response future is valid + // This is true once setupRequest is called, even though at that point the response is Never(). 
+ bool isValid() { return response.isValid(); } + + // Initializes the request state and starts the backoff delay + void setupRequest(double backoff, bool triedAllOptions, RequestStream const* stream) { + backoffDelay = (backoff > 0) ? delay(backoff) : Void(); + response = Never(); + modelHolder = Reference(); + requestStarted = false; + requestProcessed = false; + + this->stream = stream; + this->triedAllOptions = triedAllOptions; } - int errCode; - if (loadBalancedReply.present()) { - errCode = - loadBalancedReply.get().error.present() ? loadBalancedReply.get().error.get().code() : error_code_success; - } else { - errCode = result.isError() ? result.getError().code() : error_code_success; + // Sends the request to the configured stream + // This should not be called until after setupRequest has been called and the backoff delay has elapsed + void startRequest(Request request, QueueModel* model) { + ASSERT(stream); + ASSERT(backoffDelay.isReady()); + + backoffDelay = Never(); + modelHolder = Reference(new ModelHolder(model, stream->getEndpoint().token.first())); + response = stream->tryGetReply(request); + requestStarted = true; } - bool maybeDelivered = errCode == error_code_broken_promise || errCode == error_code_request_maybe_delivered; - bool receivedResponse = loadBalancedReply.present() ? !loadBalancedReply.get().error.present() : result.present(); - receivedResponse = receivedResponse || (!maybeDelivered && errCode != error_code_process_behind); - bool futureVersion = errCode == error_code_future_version || errCode == error_code_process_behind; + // Implementation of the logic to handle a response. 
+ // Checks the state of the response, updates the queue model, and returns one of the following outcomes: + // A return value of true means that the request completed successfully + // A return value of false means that the request failed but should be retried + // A return value with an error means that the error should be thrown back to original caller + static ErrorOr checkAndProcessResultImpl(ErrorOr result, + Reference modelHolder, + bool atMostOnce, + bool triedAllOptions) { + ASSERT(modelHolder); - holder->release( - receivedResponse, futureVersion, loadBalancedReply.present() ? loadBalancedReply.get().penalty : -1.0); + Optional loadBalancedReply; + if (!result.isError()) { + loadBalancedReply = getLoadBalancedReply(&result.get()); + } + + int errCode; + if (loadBalancedReply.present()) { + errCode = loadBalancedReply.get().error.present() ? loadBalancedReply.get().error.get().code() + : error_code_success; + } else { + errCode = result.isError() ? result.getError().code() : error_code_success; + } + + bool maybeDelivered = errCode == error_code_broken_promise || errCode == error_code_request_maybe_delivered; + bool receivedResponse = + loadBalancedReply.present() ? !loadBalancedReply.get().error.present() : result.present(); + receivedResponse = receivedResponse || (!maybeDelivered && errCode != error_code_process_behind); + bool futureVersion = errCode == error_code_future_version || errCode == error_code_process_behind; + + modelHolder->release( + receivedResponse, futureVersion, loadBalancedReply.present() ? loadBalancedReply.get().penalty : -1.0); + + if (errCode == error_code_server_overloaded) { + return false; + } + + if (loadBalancedReply.present() && !loadBalancedReply.get().error.present()) { + return true; + } + + if (!loadBalancedReply.present() && result.present()) { + return true; + } + + if (receivedResponse) { + return loadBalancedReply.present() ? 
loadBalancedReply.get().error.get() : result.getError(); + } + + if (atMostOnce && maybeDelivered) { + return request_maybe_delivered(); + } + + if (triedAllOptions && errCode == error_code_process_behind) { + return process_behind(); + } - if (errCode == error_code_server_overloaded) { return false; } - if (loadBalancedReply.present() && !loadBalancedReply.get().error.present()) { - return true; + // Checks the state of the response, updates the queue model, and returns one of the following outcomes: + // A return value of true means that the request completed successfully + // A return value of false means that the request failed but should be retried + // In the event of a non-retryable failure, an error is thrown indicating the failure + bool checkAndProcessResult(bool atMostOnce) { + ASSERT(response.isReady()); + requestProcessed = true; + + ErrorOr outcome = + checkAndProcessResultImpl(response.get(), std::move(modelHolder), atMostOnce, triedAllOptions); + + if (outcome.isError()) { + throw outcome.getError(); + } else if (!outcome.get()) { + response = Future>(); + } + + return outcome.get(); } - if (!loadBalancedReply.present() && result.present()) { - return true; + // Convert this request to a lagging request. Such a request is no longer being waited on, but it still needs to be + // processed so we can update the queue model. 
+ void makeLaggingRequest() { + ASSERT(response.isValid()); + ASSERT(!response.isReady()); + ASSERT(modelHolder); + ASSERT(modelHolder->model); + + QueueModel* model = modelHolder->model; + if (model->laggingRequestCount > FLOW_KNOBS->MAX_LAGGING_REQUESTS_OUTSTANDING || + model->laggingRequests.isReady()) { + model->laggingRequests.cancel(); + model->laggingRequestCount = 0; + model->addActor = PromiseStream>(); + model->laggingRequests = actorCollection(model->addActor.getFuture(), &model->laggingRequestCount); + } + + // We need to process the lagging request in order to update the queue model + Reference holderCapture = std::move(modelHolder); + bool triedAllOptionsCapture = triedAllOptions; + Future updateModel = + map(response, [holderCapture, triedAllOptionsCapture](ErrorOr result) { + checkAndProcessResultImpl(result, holderCapture, false, triedAllOptionsCapture); + return Void(); + }); + model->addActor.send(updateModel); } - if (receivedResponse) { - throw loadBalancedReply.present() ? 
loadBalancedReply.get().error.get() : result.getError(); - } - - if (atMostOnce && maybeDelivered) { - throw request_maybe_delivered(); - } - - if (triedAllOptions && errCode == error_code_process_behind) { - throw process_behind(); - } - - return false; -} - -ACTOR template -Future> makeRequest(RequestStream const* stream, - Request request, - double backoff, - Future requestUnneeded, - QueueModel* model, - bool isFirstRequest, - bool atMostOnce, - bool triedAllOptions) { - if (backoff > 0.0) { - wait(delay(backoff) || requestUnneeded); - } - - if (requestUnneeded.isReady()) { - return Optional(); - } - - state Reference holder(new ModelHolder(model, stream->getEndpoint().token.first())); - - ErrorOr result = wait(stream->tryGetReply(request)); - if (checkAndProcessResult(result, holder, atMostOnce, triedAllOptions)) { - return result.get(); - } else { - return Optional(); - } -} - -template -void addLaggingRequest(Future> reply, Promise requestFinished, QueueModel* model) { - requestFinished.send(Void()); - if (!reply.isReady()) { - if (model) { - if (model->laggingRequestCount > FLOW_KNOBS->MAX_LAGGING_REQUESTS_OUTSTANDING || - model->laggingRequests.isReady()) { - model->laggingRequests.cancel(); - model->laggingRequestCount = 0; - model->addActor = PromiseStream>(); - model->laggingRequests = actorCollection(model->addActor.getFuture(), &model->laggingRequestCount); - } - - model->addActor.send(success(errorOr(reply))); + ~RequestData() { + // If the request has been started but hasn't completed, mark it as a lagging request + if (requestStarted && !requestProcessed && modelHolder && modelHolder->model) { + makeLaggingRequest(); } } -} +}; -// Keep trying to get a reply from any of servers until success or cancellation; tries to take into account -// failMon's information for load balancing and avoiding failed servers +// Try to get a reply from one of the alternatives until success, cancellation, or certain errors. 
+// Load balancing has a budget to race requests to a second alternative if the first request is slow. +// Tries to take into account failMon's information for load balancing and avoiding failed servers. // If ALL the servers are failed and the list of servers is not fresh, throws an exception to let the caller refresh the -// list of servers. When model is set, load balance among alternatives in the same DC, aiming to balance request queue -// length on these interfaces. If too many interfaces in the same DC are bad, try remote interfaces. +// list of servers. +// When model is set, load balance among alternatives in the same DC aims to balance request queue length on these +// interfaces. If too many interfaces in the same DC are bad, try remote interfaces. ACTOR template Future loadBalance( Reference> alternatives, @@ -186,9 +246,11 @@ Future loadBalance( TaskPriority taskID = TaskPriority::DefaultPromiseEndpoint, bool atMostOnce = false, // if true, throws request_maybe_delivered() instead of retrying automatically QueueModel* model = nullptr) { - state Future> firstRequest; + + state RequestData firstRequestData; + state RequestData secondRequestData; + state Optional firstRequestEndpoint; - state Future> secondRequest; state Future secondDelay = Never(); state Promise requestFinished; @@ -320,7 +382,7 @@ Future loadBalance( } // Find an alternative, if any, that is not failed, starting with - // nextAlt. This logic matters only if model == NULL. Otherwise, the + // nextAlt. This logic matters only if model == nullptr. Otherwise, the // bestAlt and nextAlt have been decided. state RequestStream const* stream = nullptr; for (int alternativeNum = 0; alternativeNum < alternatives->size(); alternativeNum++) { @@ -340,7 +402,7 @@ Future loadBalance( stream = nullptr; } - if (!stream && !firstRequest.isValid()) { + if (!stream && !firstRequestData.isValid()) { // Everything is down! Wait for someone to be up. 
vector> ok(alternatives->size()); @@ -391,50 +453,40 @@ Future loadBalance( numAttempts = 0; // now that we've got a server back, reset the backoff } else if (!stream) { // Only the first location is available. - Optional result = wait(firstRequest); - if (result.present()) { - return result.get(); - } + loop choose { + when(wait(firstRequestData.backoffDelay)) { firstRequestData.startRequest(request, model); } + when(ErrorOr result = wait(firstRequestData.response)) { + if (firstRequestData.checkAndProcessResult(atMostOnce)) { + return result.get(); + } - firstRequest = Future>(); - firstRequestEndpoint = Optional(); - } else if (firstRequest.isValid()) { + firstRequestEndpoint = Optional(); + break; + } + } + } else if (firstRequestData.isValid()) { // Issue a second request, the first one is taking a long time. - secondRequest = makeRequest( - stream, request, backoff, requestFinished.getFuture(), model, false, atMostOnce, triedAllOptions); + secondRequestData.setupRequest(backoff, triedAllOptions, stream); state bool firstFinished = false; - loop { - choose { - when(ErrorOr> result = - wait(firstRequest.isValid() ? errorOr(firstRequest) : Never())) { - if (result.isError() || result.get().present()) { - addLaggingRequest(secondRequest, requestFinished, model); - if (result.isError()) { - throw result.getError(); - } else { - return result.get().get(); - } - } - - firstRequest = Future>(); - firstRequestEndpoint = Optional(); - firstFinished = true; + loop choose { + when(wait(firstRequestData.backoffDelay)) { firstRequestData.startRequest(request, model); } + when(wait(secondRequestData.backoffDelay)) { secondRequestData.startRequest(request, model); } + when(ErrorOr result = + wait(firstRequestData.response.isValid() ? 
firstRequestData.response : Never())) { + if (firstRequestData.checkAndProcessResult(atMostOnce)) { + return result.get(); } - when(ErrorOr> result = wait(errorOr(secondRequest))) { - if (result.isError() || result.get().present()) { - if (!firstFinished) { - addLaggingRequest(firstRequest, requestFinished, model); - } - if (result.isError()) { - throw result.getError(); - } else { - return result.get().get(); - } - } - break; + firstRequestEndpoint = Optional(); + firstFinished = true; + } + when(ErrorOr result = wait(secondRequestData.response)) { + if (secondRequestData.checkAndProcessResult(atMostOnce)) { + return result.get(); } + + break; } } @@ -445,13 +497,13 @@ Future loadBalance( } } else { // Issue a request, if it takes too long to get a reply, go around the loop - firstRequest = makeRequest( - stream, request, backoff, requestFinished.getFuture(), model, true, atMostOnce, triedAllOptions); + firstRequestData.setupRequest(backoff, triedAllOptions, stream); firstRequestEndpoint = stream->getEndpoint().token.first(); loop { choose { - when(ErrorOr> result = wait(errorOr(firstRequest))) { + when(wait(firstRequestData.backoffDelay)) { firstRequestData.startRequest(request, model); } + when(ErrorOr result = wait(firstRequestData.response)) { if (model) { model->secondMultiplier = std::max(model->secondMultiplier - FLOW_KNOBS->SECOND_REQUEST_MULTIPLIER_DECAY, 1.0); @@ -460,15 +512,10 @@ Future loadBalance( FLOW_KNOBS->SECOND_REQUEST_MAX_BUDGET); } - if (result.isError()) { - throw result.getError(); + if (firstRequestData.checkAndProcessResult(atMostOnce)) { + return result.get(); } - if (result.get().present()) { - return result.get().get(); - } - - firstRequest = Future>(); firstRequestEndpoint = Optional(); break; } From f7d3b31ef8f93a9ec845bef3a8216e70c384d804 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Wed, 24 Mar 2021 16:27:35 -0600 Subject: [PATCH 031/461] Actually close files in simulation --- fdbrpc/AsyncFileNonDurable.actor.h | 4 ++++ 
fdbrpc/sim2.actor.cpp | 16 ++++++++-------- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/fdbrpc/AsyncFileNonDurable.actor.h b/fdbrpc/AsyncFileNonDurable.actor.h index 49fe0e2c8f..13fdcc25a5 100644 --- a/fdbrpc/AsyncFileNonDurable.actor.h +++ b/fdbrpc/AsyncFileNonDurable.actor.h @@ -267,6 +267,10 @@ public: Future deleteFuture = deleteFile(this); if (!deleteFuture.isReady()) filesBeingDeleted[filename] = deleteFuture; + } else if (isSoleOwner()) { + // isSoleOwner is a bit confusing here. What we mean is that the openFiles map is the sole owner. If we + // we remove the file from the map to make sure it gets closed. + g_simulator.getCurrentProcess()->machine->openFiles.erase(filename); } } diff --git a/fdbrpc/sim2.actor.cpp b/fdbrpc/sim2.actor.cpp index 1af14ec676..6cddbb7e88 100644 --- a/fdbrpc/sim2.actor.cpp +++ b/fdbrpc/sim2.actor.cpp @@ -536,7 +536,10 @@ public: std::string getFilename() const override { return actualFilename; } - ~SimpleFile() override { _close(h); } + ~SimpleFile() override { + _close(h); + --openCount; + } private: int h; @@ -1933,10 +1936,7 @@ public: TraceEvent("ClogInterface") .detail("IP", ip.toString()) .detail("Delay", seconds) - .detail("Queue", - mode == ClogSend ? "Send" - : mode == ClogReceive ? "Receive" - : "All"); + .detail("Queue", mode == ClogSend ? "Send" : mode == ClogReceive ? "Receive" : "All"); if (mode == ClogSend || mode == ClogAll) g_clogging.clogSendFor(ip, seconds); @@ -2408,9 +2408,9 @@ int sf_open(const char* filename, int flags, int convFlags, int mode) { GENERIC_READ | ((flags & IAsyncFile::OPEN_READWRITE) ? GENERIC_WRITE : 0), FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, nullptr, - (flags & IAsyncFile::OPEN_EXCLUSIVE) ? CREATE_NEW - : (flags & IAsyncFile::OPEN_CREATE) ? OPEN_ALWAYS - : OPEN_EXISTING, + (flags & IAsyncFile::OPEN_EXCLUSIVE) + ? CREATE_NEW + : (flags & IAsyncFile::OPEN_CREATE) ? 
OPEN_ALWAYS : OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, nullptr); int h = -1; From 6a344ddeab4eac19ee34f1af7649a6b5e8e39efc Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Wed, 24 Mar 2021 16:56:11 -0600 Subject: [PATCH 032/461] fix typo --- fdbrpc/AsyncFileNonDurable.actor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbrpc/AsyncFileNonDurable.actor.h b/fdbrpc/AsyncFileNonDurable.actor.h index 13fdcc25a5..8cc65bf4a5 100644 --- a/fdbrpc/AsyncFileNonDurable.actor.h +++ b/fdbrpc/AsyncFileNonDurable.actor.h @@ -269,7 +269,7 @@ public: filesBeingDeleted[filename] = deleteFuture; } else if (isSoleOwner()) { // isSoleOwner is a bit confusing here. What we mean is that the openFiles map is the sole owner. If we - // we remove the file from the map to make sure it gets closed. + // remove the file from the map to make sure it gets closed. g_simulator.getCurrentProcess()->machine->openFiles.erase(filename); } } From b51e4aa59048ed73afbb6a6d82b4d86f520f6129 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Wed, 24 Mar 2021 19:57:24 -0600 Subject: [PATCH 033/461] handle file renames properly --- fdbrpc/AsyncFileNonDurable.actor.h | 12 +++++++++++- flow/flow.h | 2 ++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/fdbrpc/AsyncFileNonDurable.actor.h b/fdbrpc/AsyncFileNonDurable.actor.h index 8cc65bf4a5..21cfda8907 100644 --- a/fdbrpc/AsyncFileNonDurable.actor.h +++ b/fdbrpc/AsyncFileNonDurable.actor.h @@ -270,7 +270,17 @@ public: } else if (isSoleOwner()) { // isSoleOwner is a bit confusing here. What we mean is that the openFiles map is the sole owner. If we // remove the file from the map to make sure it gets closed. - g_simulator.getCurrentProcess()->machine->openFiles.erase(filename); + auto& openFiles = g_simulator.getCurrentProcess()->machine->openFiles; + auto iter = openFiles.find(filename); + // the file could've been renamed (DiskQueue does that for example). In that case the file won't be in the + // map anymore. 
+ if (iter != openFiles.end()) { + // even if the filename exists, it doesn't mean that it references the same file. It could be that the + // file was renamed and later a file with the same name was opened. + if (iter->second.canGet() && iter->second.get().getPtr() == this) { + openFiles.erase(filename); + } + } } } diff --git a/flow/flow.h b/flow/flow.h index 987572d7c5..e03d598d9b 100644 --- a/flow/flow.h +++ b/flow/flow.h @@ -674,6 +674,8 @@ public: bool isValid() const { return sav != 0; } bool isReady() const { return sav->isSet(); } bool isError() const { return sav->isError(); } + // returns true if get can be called on this future (counterpart of canBeSet on Promises) + bool canGet() const { return isValid() && isReady() && !isError(); } Error& getError() const { ASSERT(isError()); return sav->error_state; From 1385a776daa0b90cb20478251d0faf8766cb1a10 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Thu, 25 Mar 2021 13:22:29 -0600 Subject: [PATCH 034/461] only remove files from the open map if they have no modifications in flight --- fdbrpc/AsyncFileNonDurable.actor.h | 49 ++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/fdbrpc/AsyncFileNonDurable.actor.h b/fdbrpc/AsyncFileNonDurable.actor.h index 21cfda8907..281b3f289d 100644 --- a/fdbrpc/AsyncFileNonDurable.actor.h +++ b/fdbrpc/AsyncFileNonDurable.actor.h @@ -259,6 +259,37 @@ public: //TraceEvent("AsyncFileNonDurable_Destroy", id).detail("Filename", filename); } + // The purpose of this actor is to simply keep a reference to a non-durable file until all pending modifications + // have completed. When they return, this actor will die and therefore decrement the reference count by 1. 
+ ACTOR void waitOnOutstandingModifications(Reference self) { + state ISimulator::ProcessInfo* currentProcess = g_simulator.getCurrentProcess(); + state TaskPriority currentTaskID = g_network->getCurrentTask(); + state std::string filename = self->filename; + + wait(g_simulator.onMachine(currentProcess)); + try { + Promise startSyncPromise = self->startSyncPromise; + self->startSyncPromise = Promise(); + startSyncPromise.send(true); + + std::vector> outstandingModifications; + + for (auto itr = self->pendingModifications.ranges().begin(); + itr != self->pendingModifications.ranges().end(); + ++itr) + if (itr->value().isValid() && !itr->value().isReady()) + outstandingModifications.push_back(itr->value()); + + // Ignore errors here so that all modifications can finish + wait(waitForAllReady(outstandingModifications)); + wait(g_simulator.onProcess(currentProcess, currentTaskID)); + } catch (Error& e) { + state Error err = e; + wait(g_simulator.onProcess(currentProcess, currentTaskID)); + throw err; + } + } + void addref() override { ReferenceCounted::addref(); } void delref() override { if (delref_no_destroy()) { @@ -270,6 +301,24 @@ public: } else if (isSoleOwner()) { // isSoleOwner is a bit confusing here. What we mean is that the openFiles map is the sole owner. If we // remove the file from the map to make sure it gets closed. + bool hasPendingModifications = false; + for (auto iter = pendingModifications.ranges().begin(); iter != pendingModifications.ranges().end(); + ++iter) { + if (iter->value().isValid() && !iter->value().isReady()) { + hasPendingModifications = true; + break; + } + } + if (hasPendingModifications) { + // If we still have pending references we won't close the file and instead wait for them. But while we + // wait for those to complete, another actor might open the file. So we call into an actor that will + // hold a refernce until all pending operations are complete. If someone opens this file before this + // completes, nothing will happen. 
Otherwise we will enter delref again but this time + // hasPendingModifications will evalualte to false. + addref(); + waitOnOutstandingModifications(Reference(this)); + return; + } auto& openFiles = g_simulator.getCurrentProcess()->machine->openFiles; auto iter = openFiles.find(filename); // the file could've been renamed (DiskQueue does that for example). In that case the file won't be in the From 1033db9fba275a809b3159fc2d52a92293350a45 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Thu, 25 Mar 2021 14:00:07 -0600 Subject: [PATCH 035/461] Revert change --- fdbrpc/AsyncFileNonDurable.actor.h | 47 +++++++----------------------- 1 file changed, 11 insertions(+), 36 deletions(-) diff --git a/fdbrpc/AsyncFileNonDurable.actor.h b/fdbrpc/AsyncFileNonDurable.actor.h index 281b3f289d..f65895067e 100644 --- a/fdbrpc/AsyncFileNonDurable.actor.h +++ b/fdbrpc/AsyncFileNonDurable.actor.h @@ -267,27 +267,20 @@ public: state std::string filename = self->filename; wait(g_simulator.onMachine(currentProcess)); - try { - Promise startSyncPromise = self->startSyncPromise; - self->startSyncPromise = Promise(); - startSyncPromise.send(true); + Promise startSyncPromise = self->startSyncPromise; + self->startSyncPromise = Promise(); + startSyncPromise.send(true); - std::vector> outstandingModifications; + std::vector> outstandingModifications; - for (auto itr = self->pendingModifications.ranges().begin(); - itr != self->pendingModifications.ranges().end(); - ++itr) - if (itr->value().isValid() && !itr->value().isReady()) - outstandingModifications.push_back(itr->value()); + for (auto itr = self->pendingModifications.ranges().begin(); itr != self->pendingModifications.ranges().end(); + ++itr) + if (itr->value().isValid() && !itr->value().isReady()) + outstandingModifications.push_back(itr->value()); - // Ignore errors here so that all modifications can finish - wait(waitForAllReady(outstandingModifications)); - wait(g_simulator.onProcess(currentProcess, currentTaskID)); - } catch 
(Error& e) { - state Error err = e; - wait(g_simulator.onProcess(currentProcess, currentTaskID)); - throw err; - } + // Ignore errors here so that all modifications can finish + wait(waitForAllReady(outstandingModifications)); + wait(g_simulator.onProcess(currentProcess, currentTaskID)); } void addref() override { ReferenceCounted::addref(); } @@ -301,24 +294,6 @@ public: } else if (isSoleOwner()) { // isSoleOwner is a bit confusing here. What we mean is that the openFiles map is the sole owner. If we // remove the file from the map to make sure it gets closed. - bool hasPendingModifications = false; - for (auto iter = pendingModifications.ranges().begin(); iter != pendingModifications.ranges().end(); - ++iter) { - if (iter->value().isValid() && !iter->value().isReady()) { - hasPendingModifications = true; - break; - } - } - if (hasPendingModifications) { - // If we still have pending references we won't close the file and instead wait for them. But while we - // wait for those to complete, another actor might open the file. So we call into an actor that will - // hold a refernce until all pending operations are complete. If someone opens this file before this - // completes, nothing will happen. Otherwise we will enter delref again but this time - // hasPendingModifications will evalualte to false. - addref(); - waitOnOutstandingModifications(Reference(this)); - return; - } auto& openFiles = g_simulator.getCurrentProcess()->machine->openFiles; auto iter = openFiles.find(filename); // the file could've been renamed (DiskQueue does that for example). 
In that case the file won't be in the From c3ba4659ff461d3a5eb16eaa62d563627ea2032b Mon Sep 17 00:00:00 2001 From: Andrew Noyes Date: Fri, 26 Mar 2021 18:06:21 +0000 Subject: [PATCH 036/461] Document that ryw disable can only be set at beginning of transaction --- fdbclient/vexillographer/fdb.options | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbclient/vexillographer/fdb.options b/fdbclient/vexillographer/fdb.options index 82ba1910c2..db68bb31a4 100644 --- a/fdbclient/vexillographer/fdb.options +++ b/fdbclient/vexillographer/fdb.options @@ -210,7 +210,7 @@ description is not currently required but encouraged. \ No newline at end of file diff --git a/flow/actorcompiler/actorcompiler.sln b/flow/actorcompiler/actorcompiler.sln deleted file mode 100644 index a4292bfaaa..0000000000 --- a/flow/actorcompiler/actorcompiler.sln +++ /dev/null @@ -1,34 +0,0 @@ - -Microsoft Visual Studio Solution File, Format Version 12.00 -# Visual Studio 15 -VisualStudioVersion = 15.0.26124.0 -MinimumVisualStudioVersion = 15.0.26124.0 -Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "actorcompiler", "actorcompiler.csproj", "{0ECC1314-3FC2-458D-8E41-B50B4EA24E51}" -EndProject -Global - GlobalSection(SolutionConfigurationPlatforms) = preSolution - Debug|Any CPU = Debug|Any CPU - Debug|x64 = Debug|x64 - Debug|x86 = Debug|x86 - Release|Any CPU = Release|Any CPU - Release|x64 = Release|x64 - Release|x86 = Release|x86 - EndGlobalSection - GlobalSection(SolutionProperties) = preSolution - HideSolutionNode = FALSE - EndGlobalSection - GlobalSection(ProjectConfigurationPlatforms) = postSolution - {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Debug|Any CPU.ActiveCfg = Debug|Any CPU - {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Debug|Any CPU.Build.0 = Debug|Any CPU - {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Debug|x64.ActiveCfg = Debug|Any CPU - {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Debug|x64.Build.0 = Debug|Any CPU - {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Debug|x86.ActiveCfg = Debug|Any 
CPU - {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Debug|x86.Build.0 = Debug|Any CPU - {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Release|Any CPU.ActiveCfg = Debug|Any CPU - {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Release|Any CPU.Build.0 = Debug|Any CPU - {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Release|x64.ActiveCfg = Debug|Any CPU - {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Release|x64.Build.0 = Debug|Any CPU - {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Release|x86.ActiveCfg = Debug|Any CPU - {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Release|x86.Build.0 = Debug|Any CPU - EndGlobalSection -EndGlobal diff --git a/flow/flow.cpp b/flow/flow.cpp index ec65640fe2..74f0b334f5 100644 --- a/flow/flow.cpp +++ b/flow/flow.cpp @@ -26,27 +26,6 @@ #include #include -thread_local Reference currentLineage; -WriteOnlyVariable currentLineageThreadSafe; - -LineagePropertiesBase::~LineagePropertiesBase() {} - -ActorLineage::ActorLineage() : properties(), parent(currentLineage) {} - -ActorLineage::~ActorLineage() { - for (auto ptr : properties) { - delete ptr.second; - } -} - -using namespace std::literals; - -const std::string_view StackLineage::name = "StackLineage"sv; - -std::vector getActorStackTrace() { - return currentLineage->stack(&StackLineage::actorName); -} - #if (defined(__linux__) || defined(__FreeBSD__)) && defined(__AVX__) && !defined(MEMORY_SANITIZER) // For benchmarking; need a version of rte_memcpy that doesn't live in the same compilation unit as the test. void* rte_memcpy_noinline(void* __restrict __dest, const void* __restrict __src, size_t __n) { diff --git a/flow/flow.h b/flow/flow.h index 8388113253..987572d7c5 100644 --- a/flow/flow.h +++ b/flow/flow.h @@ -20,8 +20,6 @@ #ifndef FLOW_FLOW_H #define FLOW_FLOW_H -#include "flow/Arena.h" -#include "flow/FastRef.h" #pragma once #pragma warning(disable : 4244 4267) // SOMEDAY: Carefully check for integer overflow issues (e.g. 
size_t to int @@ -31,18 +29,14 @@ #include #include -#include #include #include #include #include #include #include -#include #include #include -#include -#include #include "flow/Platform.h" #include "flow/FastAlloc.h" @@ -52,7 +46,6 @@ #include "flow/ThreadPrimitives.h" #include "flow/network.h" #include "flow/FileIdentifier.h" -#include "flow/WriteOnlySet.h" #include @@ -427,127 +420,6 @@ struct SingleCallback { } }; -struct LineagePropertiesBase { - virtual ~LineagePropertiesBase(); -}; - -// helper class to make implementation of LineageProperties easier -template -struct LineageProperties : LineagePropertiesBase { - // Contract: - // - // StringRef name = "SomeUniqueName"_str; - - // this has to be implemented by subclasses - // but can't be made virtual. - // A user should implement this for any type - // within the properies class. - template - bool isSet(Value Derived::*member) const { - return true; - } -}; - -struct ActorLineage : ThreadSafeReferenceCounted { - friend class LocalLineage; - -private: - std::unordered_map properties; - Reference parent; - mutable std::mutex mutex; - using Lock = std::unique_lock; - -public: - ActorLineage(); - ~ActorLineage(); - bool isRoot() const { - Lock _{ mutex }; - return parent.getPtr() == nullptr; - } - void makeRoot() { - Lock _{ mutex }; - parent.clear(); - } - template - V& modify(V T::*member) { - Lock _{ mutex }; - auto& res = properties[T::name]; - if (!res) { - res = new T{}; - } - T* map = static_cast(res); - return map->*member; - } - template - std::optional get(V T::*member) const { - Lock _{ mutex }; - auto current = this; - while (current != nullptr) { - auto iter = current->properties.find(T::name); - if (iter != current->properties.end()) { - T const& map = static_cast(*iter->second); - if (map.isSet(member)) { - return map.*member; - } - } - current = current->parent.getPtr(); - } - return std::optional{}; - } - template - std::vector stack(V T::*member) const { - Lock _{ mutex }; - auto current = 
this; - std::vector res; - while (current != nullptr) { - auto iter = current->properties.find(T::name); - if (iter != current->properties.end()) { - T const& map = static_cast(*iter->second); - if (map.isSet(member)) { - res.push_back(map.*member); - } - } - current = current->parent.getPtr(); - } - return res; - } -}; - -extern thread_local Reference currentLineage; -extern WriteOnlyVariable currentLineageThreadSafe; - -// This class can be used in order to modify all lineage properties -// of actors created within a (non-actor) scope -struct LocalLineage { - Reference lineage = Reference{ new ActorLineage() }; - Reference oldLineage; - LocalLineage() { - oldLineage = currentLineage; - currentLineage = lineage; - currentLineageThreadSafe.replace(lineage); - } - ~LocalLineage() { - currentLineage = oldLineage; - currentLineageThreadSafe.replace(oldLineage); - } -}; - -struct restore_lineage { - Reference prev; - restore_lineage() : prev(currentLineage) {} - ~restore_lineage() { - currentLineage = prev; - currentLineageThreadSafe.replace(prev); - } -}; - -struct StackLineage : LineageProperties { - static const std::string_view name; - StringRef actorName; -}; - -extern std::vector getActorStackTrace(); - // SAV is short for Single Assignment Variable: It can be assigned for only once! 
template struct SAV : private Callback, FastAllocated> { @@ -589,9 +461,8 @@ public: ASSERT(canBeSet()); new (&value_storage) T(std::forward(value)); this->error_state = Error::fromCode(SET_ERROR_CODE); - while (Callback::next != this) { + while (Callback::next != this) Callback::next->fire(this->value()); - } } void send(Never) { @@ -602,9 +473,8 @@ public: void sendError(Error err) { ASSERT(canBeSet() && int16_t(err.code()) > 0); this->error_state = err; - while (Callback::next != this) { + while (Callback::next != this) Callback::next->error(err); - } } template @@ -753,9 +623,8 @@ struct NotifiedQueue : private SingleCallback, FastAllocated return; this->error = err; - if (SingleCallback::next != this) { + if (SingleCallback::next != this) SingleCallback::next->error(err); - } } void addPromiseRef() { promises++; } @@ -1123,73 +992,36 @@ static inline void destruct(T& t) { template struct Actor : SAV { - Reference lineage = Reference{ new ActorLineage() }; int8_t actor_wait_state; // -1 means actor is cancelled; 0 means actor is not waiting; 1-N mean waiting in callback // group # - Actor() : SAV(1, 1), actor_wait_state(0) { - /*++actorCount;*/ - currentLineage = lineage; - currentLineageThreadSafe.replace(lineage); + Actor() : SAV(1, 1), actor_wait_state(0) { /*++actorCount;*/ } //~Actor() { --actorCount; } - - Reference setLineage() { - auto res = currentLineage; - currentLineage = lineage; - currentLineageThreadSafe.replace(lineage); - return res; - } }; template <> struct Actor { // This specialization is for a void actor (one not returning a future, hence also uncancellable) - Reference lineage = Reference{ new ActorLineage() }; int8_t actor_wait_state; // 0 means actor is not waiting; 1-N mean waiting in callback group # - Actor() : actor_wait_state(0) { - /*++actorCount;*/ - currentLineage = lineage; - currentLineageThreadSafe.replace(lineage); + Actor() : actor_wait_state(0) { /*++actorCount;*/ } //~Actor() { --actorCount; } - - Reference setLineage() { 
- auto res = currentLineage; - currentLineage = lineage; - currentLineageThreadSafe.replace(lineage); - return res; - } }; template struct ActorCallback : Callback { - virtual void fire(ValueType const& value) override { - auto _ = static_cast(this)->setLineage(); - static_cast(this)->a_callback_fire(this, value); - } - virtual void error(Error e) override { - auto _ = static_cast(this)->setLineage(); - static_cast(this)->a_callback_error(this, e); - } + void fire(ValueType const& value) override { static_cast(this)->a_callback_fire(this, value); } + void error(Error e) override { static_cast(this)->a_callback_error(this, e); } }; template struct ActorSingleCallback : SingleCallback { - void fire(ValueType const& value) override { - auto _ = static_cast(this)->setLineage(); - static_cast(this)->a_callback_fire(this, value); - } - void fire(ValueType&& value) override { - auto _ = static_cast(this)->setLineage(); - static_cast(this)->a_callback_fire(this, std::move(value)); - } - void error(Error e) override { - auto _ = static_cast(this)->setLineage(); - static_cast(this)->a_callback_error(this, e); - } + void fire(ValueType const& value) override { static_cast(this)->a_callback_fire(this, value); } + void fire(ValueType&& value) override { static_cast(this)->a_callback_fire(this, std::move(value)); } + void error(Error e) override { static_cast(this)->a_callback_error(this, e); } }; inline double now() { return g_network->now(); diff --git a/flow/genericactors.actor.h b/flow/genericactors.actor.h index 8561bc623c..46cdb6d73f 100644 --- a/flow/genericactors.actor.h +++ b/flow/genericactors.actor.h @@ -1547,10 +1547,6 @@ struct YieldedFutureActor : SAV, ActorCallback setLineage() { - return currentLineage; - } - void a_callback_fire(ActorCallback*, Void) { if (int16_t(in_error_state.code()) == UNSET_ERROR_CODE) { in_error_state = Error::fromCode(SET_ERROR_CODE); diff --git a/flow/network.h b/flow/network.h index e5683e4ca7..1eeb5bdc2d 100644 --- a/flow/network.h +++ 
b/flow/network.h @@ -35,7 +35,6 @@ #include "flow/Arena.h" #include "flow/IRandom.h" #include "flow/Trace.h" -#include "flow/WriteOnlySet.h" enum class TaskPriority { Max = 1000000, @@ -560,9 +559,6 @@ public: // returns false. virtual bool checkRunnable() = 0; - // Returns the shared memory data structure used to store actor lineages. - virtual ActorLineageSet& getActorLineageSet() = 0; - virtual ProtocolVersion protocolVersion() = 0; // Shorthand for transport().getLocalAddress() diff --git a/flow/singleton.h b/flow/singleton.h deleted file mode 100644 index c6a256ac42..0000000000 --- a/flow/singleton.h +++ /dev/null @@ -1,237 +0,0 @@ -/* - * (C) Copyright 2015 ETH Zurich Systems Group (http://www.systems.ethz.ch/) and others. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - * Contributors: - * Markus Pilman - * Simon Loesing - * Thomas Etter - * Kevin Bocksrocker - * Lucas Braun - */ -#pragma once - -#include -#include -#include -#include - -namespace crossbow { - -/** - * @brief A mock mutex for disabling locking in the singleton - * - * This class implements the mutex concept with empty methods. - * This can be used to disable synchronization in the singleton - * holder. 
- */ -struct no_locking { - void lock() {} - void unlock() {} - bool try_lock() { return true; } -}; - -template -struct create_static { - static constexpr bool supports_recreation = false; - union max_align { - char t_[sizeof(T)]; - short int short_int_; - long int long_int_; - float float_; - double double_; - long double longDouble_; - struct Test; - int Test::*pMember_; - int (Test::*pMemberFn_)(int); - }; - - static T* create() { - static max_align static_memory_; - return new (&static_memory_) T; - } - - static void destroy(T* ptr) { ptr->~T(); } -}; - -template -struct create_using_new { - static constexpr bool supports_recreation = true; - static T* create() { return new T; }; - - static void destroy(T* ptr) { delete ptr; } -}; - -template -struct create_using_malloc { - static constexpr bool supports_recreation = true; - static T* create() { - void* p = std::malloc(sizeof(T)); - if (!p) - return nullptr; - return new (p) T; - } - - static void destroy(T* ptr) { - ptr->~T(); - free(ptr); - } -}; - -template -struct create_using { - static constexpr bool supports_recreation = true; - static allocator alloc_; - - static T* create() { - T* p = alloc_.allocate(1); - if (!p) - return nullptr; - alloc_.construct(p); - return p; - }; - - static void destroy(T* ptr) { - alloc_.destroy(ptr); - alloc_.deallocate(ptr, 1); - } -}; - -template -struct default_lifetime { - static void schedule_destruction(T*, void (*func)()) { std::atexit(func); } - - static void on_dead_ref() { throw std::logic_error("Dead reference detected"); } -}; - -template -struct phoenix_lifetime { - static void schedule_destruction(T*, void (*func)()) { std::atexit(func); } - - static void on_dead_ref() {} -}; - -template -struct infinite_lifetime { - static void schedule_destruction(T*, void (*)()) {} - static void on_dead_ref() {} -}; - -template -struct lifetime_traits { - static constexpr bool supports_recreation = true; -}; - -template -struct lifetime_traits> { - static constexpr bool 
supports_recreation = false; -}; - -template -struct lifetime_traits> { - static constexpr bool supports_recreation = false; -}; - -template , - typename LifetimePolicy = default_lifetime, - typename Mutex = std::mutex> -class singleton { -public: - typedef Type value_type; - typedef Type* pointer; - typedef const Type* const_pointer; - typedef const Type& const_reference; - typedef Type& reference; - -private: - static bool destroyed_; - static pointer instance_; - static Mutex mutex_; - - static void destroy() { - if (destroyed_) - return; - Create::destroy(instance_); - instance_ = nullptr; - destroyed_ = true; - } - -public: - static reference instance() { - static_assert(Create::supports_recreation || !lifetime_traits::supports_recreation, - "The creation policy does not support instance recreation, while the lifetime does support it."); - if (!instance_) { - std::lock_guard l(mutex_); - if (!instance_) { - if (destroyed_) { - destroyed_ = false; - LifetimePolicy::on_dead_ref(); - } - instance_ = Create::create(); - LifetimePolicy::schedule_destruction(instance_, &destroy); - } - } - return *instance_; - } - /** - * WARNING: DO NOT EXECUTE THIS MULTITHREADED!!! 
- */ - static void destroy_instance() { - if (instance_) { - std::lock_guard l(mutex_); - destroy(); - } - } - -public: - pointer operator->() { - if (!instance_) { - instance(); - } - return instance_; - } - - reference operator*() { - if (!instance_) { - instance(); - } - return *instance_; - } - - const_pointer operator->() const { - if (!instance_) { - instance(); - } - return instance_; - } - - const_reference operator*() const { - if (!instance_) { - instance(); - } - return *instance_; - } -}; - -template -bool singleton::destroyed_ = false; - -template -typename singleton::pointer singleton::instance_ = nullptr; - -template -M singleton::mutex_; - -} // namespace crossbow \ No newline at end of file diff --git a/tests/TestRunner/local_cluster.py b/tests/TestRunner/local_cluster.py index 85f2094774..68318d51dd 100644 --- a/tests/TestRunner/local_cluster.py +++ b/tests/TestRunner/local_cluster.py @@ -38,7 +38,7 @@ cluster_file = {etcdir}/fdb.cluster command = {fdbserver_bin} public_address = auto:$ID listen_address = public -datadir = {datadir}/$ID +datadir = {datadir} logdir = {logdir} # logsize = 10MiB # maxlogssize = 100MiB From 359abfb0087b68d028ad81d365cf7450eca58167 Mon Sep 17 00:00:00 2001 From: Chaoguang Lin Date: Tue, 11 May 2021 12:08:48 -0700 Subject: [PATCH 378/461] Update FDB_API_VERSION to 710 --- fdbcli/fdbcli.actor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbcli/fdbcli.actor.cpp b/fdbcli/fdbcli.actor.cpp index 4d49c8efc6..49ca2547ff 100644 --- a/fdbcli/fdbcli.actor.cpp +++ b/fdbcli/fdbcli.actor.cpp @@ -60,7 +60,7 @@ #include "flow/actorcompiler.h" // This must be the last #include. -#define FDB_API_VERSION 700 +#define FDB_API_VERSION 710 /* * While we could just use the MultiVersionApi instance directly, this #define allows us to swap in any other IClientApi * instance (e.g. 
from ThreadSafeApi) From 8002a389d4ecb5abc78cf5de027b33ea85c035dc Mon Sep 17 00:00:00 2001 From: Chaoguang Lin Date: Tue, 11 May 2021 12:12:33 -0700 Subject: [PATCH 379/461] add comments for error handling in ConsistencyCheckCommand.actor.cpp --- fdbcli/ConsistencyCheckCommand.actor.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fdbcli/ConsistencyCheckCommand.actor.cpp b/fdbcli/ConsistencyCheckCommand.actor.cpp index 4c4370ff30..38fc310237 100644 --- a/fdbcli/ConsistencyCheckCommand.actor.cpp +++ b/fdbcli/ConsistencyCheckCommand.actor.cpp @@ -31,6 +31,8 @@ using namespace fdb_cli; ACTOR static Future consistencyCheckCommandActor(Reference tr, std::vector tokens) { + // We do not add a try-catch loop here as the this transaction is always supposed to succeed + // If not, the outer loop catch block(fdbcli.actor.cpp) will handle the error and print out the error message tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); if (tokens.size() == 1) { Optional suspended = wait(safeThreadFutureToFuture(tr->get(consistencyCheckSpecialKey))); From ed3415c93e202f0d2a3fb219ef1b86b83ac561cf Mon Sep 17 00:00:00 2001 From: Chaoguang Lin Date: Tue, 11 May 2021 12:21:09 -0700 Subject: [PATCH 380/461] Guard the added unit test by NOT OPEN_FOR_IDE --- tests/CMakeLists.txt | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 76bab08cde..e12b1e3ce9 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -268,13 +268,12 @@ if(WITH_PYTHON) NAME multiversion_client/unit_tests COMMAND $ -r unittests -f /fdbclient/multiversionclient/ ) + add_test( + NAME threadsafe_threadfuture_to_future/unit_tests + COMMAND $ -r unittests -f /flow/safeThreadFutureToFuture/ + ) endif() - add_test( - NAME threadsafe_threadfuture_to_future/unit_tests - COMMAND $ -r unittests -f /flow/safeThreadFutureToFuture/ - ) - verify_testing() if (NOT OPEN_FOR_IDE AND NOT WIN32) create_correctness_package() From 
b0554b455478cb9d039ea4df17fb200eec88d1b2 Mon Sep 17 00:00:00 2001 From: Sreenath Bodagala Date: Tue, 11 May 2021 20:03:21 +0000 Subject: [PATCH 381/461] Capture how fast an SS is catching up to its tLog-SS lag Changes: LogSystem.h, LogSystemPeekCursor.actor.cpp: Add APIs to find the ID of the tLog from which an SS has fetched the latest set of versions. storageserver.actor.cpp: Capture the number of latest set of versions fetched, the time (in seconds) in which those versions were fetched, and the tLog from which they were fetched. Add this information to a TraceLogEvent. Capture how many versions an SS has fetched in the --- fdbserver/LogSystem.h | 7 +++++++ fdbserver/LogSystemPeekCursor.actor.cpp | 28 ++++++++++++++++++++++++- fdbserver/storageserver.actor.cpp | 19 +++++++++++++++-- 3 files changed, 51 insertions(+), 3 deletions(-) diff --git a/fdbserver/LogSystem.h b/fdbserver/LogSystem.h index f8a3e0b725..da2fbcf5f2 100644 --- a/fdbserver/LogSystem.h +++ b/fdbserver/LogSystem.h @@ -410,6 +410,8 @@ struct ILogSystem { virtual Optional getPrimaryPeekLocation() const = 0; + virtual Optional getCurrentPeekLocation() const = 0; + virtual void addref() = 0; virtual void delref() = 0; @@ -473,6 +475,7 @@ struct ILogSystem { Version popped() const override; Version getMinKnownCommittedVersion() const override; Optional getPrimaryPeekLocation() const override; + Optional getCurrentPeekLocation() const override; void addref() override { ReferenceCounted::addref(); } @@ -534,6 +537,7 @@ struct ILogSystem { Version popped() const override; Version getMinKnownCommittedVersion() const override; Optional getPrimaryPeekLocation() const override; + Optional getCurrentPeekLocation() const override; void addref() override { ReferenceCounted::addref(); } @@ -589,6 +593,7 @@ struct ILogSystem { Version popped() const override; Version getMinKnownCommittedVersion() const override; Optional getPrimaryPeekLocation() const override; + Optional getCurrentPeekLocation() const override; 
void addref() override { ReferenceCounted::addref(); } @@ -620,6 +625,7 @@ struct ILogSystem { Version popped() const override; Version getMinKnownCommittedVersion() const override; Optional getPrimaryPeekLocation() const override; + Optional getCurrentPeekLocation() const override; void addref() override { ReferenceCounted::addref(); } @@ -698,6 +704,7 @@ struct ILogSystem { Version popped() const override; Version getMinKnownCommittedVersion() const override; Optional getPrimaryPeekLocation() const override; + Optional getCurrentPeekLocation() const override; void addref() override { ReferenceCounted::addref(); } diff --git a/fdbserver/LogSystemPeekCursor.actor.cpp b/fdbserver/LogSystemPeekCursor.actor.cpp index cc8a350845..09e692e0b6 100644 --- a/fdbserver/LogSystemPeekCursor.actor.cpp +++ b/fdbserver/LogSystemPeekCursor.actor.cpp @@ -393,12 +393,16 @@ Version ILogSystem::ServerPeekCursor::getMinKnownCommittedVersion() const { } Optional ILogSystem::ServerPeekCursor::getPrimaryPeekLocation() const { - if (interf) { + if (interf->get().present()) { return interf->get().id(); } return Optional(); } +Optional ILogSystem::ServerPeekCursor::getCurrentPeekLocation() const { + return ILogSystem::ServerPeekCursor::getPrimaryPeekLocation(); +} + Version ILogSystem::ServerPeekCursor::popped() const { return poppedVersion; } @@ -673,6 +677,13 @@ Optional ILogSystem::MergedPeekCursor::getPrimaryPeekLocation() const { return Optional(); } +Optional ILogSystem::MergedPeekCursor::getCurrentPeekLocation() const { + if (currentCursor >= 0) { + return serverCursors[currentCursor]->getPrimaryPeekLocation(); + } + return Optional(); +} + Version ILogSystem::MergedPeekCursor::popped() const { Version poppedVersion = 0; for (auto& c : serverCursors) @@ -1023,6 +1034,13 @@ Optional ILogSystem::SetPeekCursor::getPrimaryPeekLocation() const { return Optional(); } +Optional ILogSystem::SetPeekCursor::getCurrentPeekLocation() const { + if (currentCursor >= 0 && currentSet >= 0) { + return 
serverCursors[currentSet][currentCursor]->getPrimaryPeekLocation(); + } + return Optional(); +} + Version ILogSystem::SetPeekCursor::popped() const { Version poppedVersion = 0; for (auto& cursors : serverCursors) { @@ -1123,6 +1141,10 @@ Optional ILogSystem::MultiCursor::getPrimaryPeekLocation() const { return cursors.back()->getPrimaryPeekLocation(); } +Optional ILogSystem::MultiCursor::getCurrentPeekLocation() const { + return cursors.back()->getCurrentPeekLocation(); +} + Version ILogSystem::MultiCursor::popped() const { return std::max(poppedVersion, cursors.back()->popped()); } @@ -1403,6 +1425,10 @@ Optional ILogSystem::BufferedCursor::getPrimaryPeekLocation() const { return Optional(); } +Optional ILogSystem::BufferedCursor::getCurrentPeekLocation() const { + return Optional(); +} + Version ILogSystem::BufferedCursor::popped() const { if (initialPoppedVersion == poppedVersion) { return 0; diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp index c92402ce0d..728dae05fc 100644 --- a/fdbserver/storageserver.actor.cpp +++ b/fdbserver/storageserver.actor.cpp @@ -545,6 +545,10 @@ public: int64_t versionLag; // An estimate for how many versions it takes for the data to move from the logs to this storage // server + int64_t versionCount; + double duration; + Optional sourceTLogID; + ProtocolVersion logProtocol; Reference logSystem; @@ -732,7 +736,7 @@ public: : fetchKeysHistograms(), instanceID(deterministicRandom()->randomUniqueID().first()), storage(this, storage), db(db), actors(false), lastTLogVersion(0), lastVersionWithData(0), restoredVersion(0), rebootAfterDurableVersion(std::numeric_limits::max()), durableInProgress(Void()), versionLag(0), - primaryLocality(tagLocalityInvalid), updateEagerReads(0), shardChangeCounter(0), + versionCount(0), duration(0), primaryLocality(tagLocalityInvalid), updateEagerReads(0), shardChangeCounter(0), fetchKeysParallelismLock(SERVER_KNOBS->FETCH_KEYS_PARALLELISM_BYTES), shuttingDown(false), 
debug_inApplyUpdate(false), debug_lastValidateTime(0), watchBytes(0), numWatches(0), logProtocol(0), counters(this), tag(invalidTag), maxQueryQueue(0), thisServerID(ssi.id()), @@ -3523,9 +3527,20 @@ ACTOR Future update(StorageServer* data, bool* pReceivedUpdate) { if (data->otherError.getFuture().isReady()) data->otherError.getFuture().get(); + auto curTime = now(); + data->versionCount = ver - data->version.get(); + data->duration = curTime - data->lastUpdate; + data->sourceTLogID = cursor->getCurrentPeekLocation(); + + TraceEvent("StorageServerCatchUpRate", data->thisServerID) + .detail("VersionCount", data->versionCount) + .detail("Duration", data->duration) + .detail("SourceTLogId", data->sourceTLogID.present() ? data->sourceTLogID.get().toString() : "unknown"); + data->noRecentUpdates.set(false); - data->lastUpdate = now(); + data->lastUpdate = curTime; data->version.set(ver); // Triggers replies to waiting gets for new version(s) + setDataVersion(data->thisServerID, data->version.get()); if (data->otherError.getFuture().isReady()) data->otherError.getFuture().get(); From 9a6151d3fcb5838f27f6c6f1d685df913981319a Mon Sep 17 00:00:00 2001 From: Chaoguang Lin Date: Tue, 11 May 2021 14:31:08 -0700 Subject: [PATCH 382/461] Update fdbcli.h to fdbcli.actor.h, removed the unnecessary wrapper --- fdbcli/CMakeLists.txt | 2 +- fdbcli/ConsistencyCheckCommand.actor.cpp | 16 +++++----------- fdbcli/Util.cpp | 2 +- fdbcli/fdbcli.actor.cpp | 4 ++-- fdbcli/{fdbcli.h => fdbcli.actor.h} | 15 ++++++++++++--- 5 files changed, 21 insertions(+), 18 deletions(-) rename fdbcli/{fdbcli.h => fdbcli.actor.h} (77%) diff --git a/fdbcli/CMakeLists.txt b/fdbcli/CMakeLists.txt index b97619fc9a..d0cab5b178 100644 --- a/fdbcli/CMakeLists.txt +++ b/fdbcli/CMakeLists.txt @@ -1,6 +1,6 @@ set(FDBCLI_SRCS - fdbcli.h fdbcli.actor.cpp + fdbcli.actor.h ConsistencyCheckCommand.actor.cpp FlowLineNoise.actor.cpp FlowLineNoise.h diff --git a/fdbcli/ConsistencyCheckCommand.actor.cpp 
b/fdbcli/ConsistencyCheckCommand.actor.cpp index 38fc310237..4b8107f954 100644 --- a/fdbcli/ConsistencyCheckCommand.actor.cpp +++ b/fdbcli/ConsistencyCheckCommand.actor.cpp @@ -18,7 +18,7 @@ * limitations under the License. */ -#include "fdbcli/fdbcli.h" +#include "fdbcli/fdbcli.actor.h" #include "fdbclient/FDBOptions.g.h" #include "fdbclient/IClientApi.h" @@ -28,9 +28,11 @@ #include "flow/ThreadHelper.actor.h" #include "flow/actorcompiler.h" // This must be the last #include. -using namespace fdb_cli; +namespace fdb_cli { -ACTOR static Future consistencyCheckCommandActor(Reference tr, std::vector tokens) { +const KeyRef consistencyCheckSpecialKey = LiteralStringRef("\xff\xff/management/consistency_check_suspended"); + +ACTOR Future consistencyCheckCommandActor(Reference tr, std::vector tokens) { // We do not add a try-catch loop here as the this transaction is always supposed to succeed // If not, the outer loop catch block(fdbcli.actor.cpp) will handle the error and print out the error message tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); @@ -50,14 +52,6 @@ ACTOR static Future consistencyCheckCommandActor(Reference t return true; } -namespace fdb_cli { - -const KeyRef consistencyCheckSpecialKey = LiteralStringRef("\xff\xff/management/consistency_check_suspended"); - -Future consistencyCheckCommand(Reference tr, std::vector tokens) { - return consistencyCheckCommandActor(tr, tokens); -} - CommandFactory consistencyCheckFactory( "consistencycheck", CommandHelp( diff --git a/fdbcli/Util.cpp b/fdbcli/Util.cpp index 2b755bd9d3..f67f27c774 100644 --- a/fdbcli/Util.cpp +++ b/fdbcli/Util.cpp @@ -18,7 +18,7 @@ * limitations under the License. 
*/ -#include "fdbcli/fdbcli.h" +#include "fdbcli/fdbcli.actor.h" #include "flow/Arena.h" diff --git a/fdbcli/fdbcli.actor.cpp b/fdbcli/fdbcli.actor.cpp index 49ca2547ff..d10da845ec 100644 --- a/fdbcli/fdbcli.actor.cpp +++ b/fdbcli/fdbcli.actor.cpp @@ -44,7 +44,7 @@ #include "flow/SimpleOpt.h" #include "fdbcli/FlowLineNoise.h" -#include "fdbcli/fdbcli.h" +#include "fdbcli/fdbcli.actor.h" #include #include @@ -3821,7 +3821,7 @@ ACTOR Future cli(CLIOptions opt, LineNoise* plinenoise) { if (tokencmp(tokens[0], "consistencycheck")) { getTransaction(db, tr, tr2, options, intrans); - bool _result = wait(consistencyCheckCommand(tr2, tokens)); + bool _result = wait(consistencyCheckCommandActor(tr2, tokens)); is_error = !_result; continue; } diff --git a/fdbcli/fdbcli.h b/fdbcli/fdbcli.actor.h similarity index 77% rename from fdbcli/fdbcli.h rename to fdbcli/fdbcli.actor.h index 831de2decd..ceae1263c2 100644 --- a/fdbcli/fdbcli.h +++ b/fdbcli/fdbcli.actor.h @@ -18,13 +18,21 @@ * limitations under the License. */ -#ifndef FDBCLI_H -#define FDBCLI_H #pragma once +// When actually compiled (NO_INTELLISENSE), include the generated +// version of this file. In intellisense use the source version. +#if defined(NO_INTELLISENSE) && !defined(FDBCLI_FDBCLI_ACTOR_G_H) +#define FDBCLI_FDBCLI_ACTOR_G_H +#include "fdbcli/fdbcli.actor.g.h" +#elif !defined(FDBCLI_FDBCLI_ACTOR_H) +#define FDBCLI_FDBCLI_ACTOR_H + #include "fdbclient/IClientApi.h" #include "flow/Arena.h" +#include "flow/actorcompiler.h" // This must be the last #include. 
+ namespace fdb_cli { struct CommandHelp { @@ -62,8 +70,9 @@ void printUsage(StringRef command); // All fdbcli commands (alphabetically) // consistency command -Future consistencyCheckCommand(Reference tr, std::vector tokens); +ACTOR Future consistencyCheckCommandActor(Reference tr, std::vector tokens); } // namespace fdb_cli +#include "flow/unactorcompiler.h" #endif From 6e10a8abf17c235ea8b16e4c3afba60ce7ebfaeb Mon Sep 17 00:00:00 2001 From: Chaoguang Lin Date: Tue, 11 May 2021 14:38:21 -0700 Subject: [PATCH 383/461] fix header's include order --- flow/ThreadHelper.actor.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/flow/ThreadHelper.actor.cpp b/flow/ThreadHelper.actor.cpp index 4c0a89c7d5..06645f8d3e 100644 --- a/flow/ThreadHelper.actor.cpp +++ b/flow/ThreadHelper.actor.cpp @@ -18,13 +18,14 @@ * limitations under the License. */ +#include + +#include "flow/flow.h" +#include "flow/network.h" #include "flow/ThreadHelper.actor.h" #include "flow/Error.h" #include "flow/UnitTest.h" #include "flow/actorcompiler.h" // This must be the last #include. 
-#include "flow/flow.h" -#include "flow/network.h" -#include ThreadCallback* ThreadCallback::addCallback(ThreadCallback* cb) { return (new ThreadMultiCallback())->addCallback(this)->addCallback(cb); From 42eced15c9ed26e46d0284e6630bf3e40c15b6c2 Mon Sep 17 00:00:00 2001 From: Chaoguang Lin Date: Tue, 11 May 2021 16:46:07 -0700 Subject: [PATCH 384/461] Update comments and trigger CI --- fdbcli/ConsistencyCheckCommand.actor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbcli/ConsistencyCheckCommand.actor.cpp b/fdbcli/ConsistencyCheckCommand.actor.cpp index 4b8107f954..892acbb239 100644 --- a/fdbcli/ConsistencyCheckCommand.actor.cpp +++ b/fdbcli/ConsistencyCheckCommand.actor.cpp @@ -33,7 +33,7 @@ namespace fdb_cli { const KeyRef consistencyCheckSpecialKey = LiteralStringRef("\xff\xff/management/consistency_check_suspended"); ACTOR Future consistencyCheckCommandActor(Reference tr, std::vector tokens) { - // We do not add a try-catch loop here as the this transaction is always supposed to succeed + // Here we do not proceed in a try-catch loop since the transaction is always supposed to succeed. 
 // If not, the outer loop catch block(fdbcli.actor.cpp) will handle the error and print out the error message
 tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES);
 if (tokens.size() == 1) {

From 4361dcca2e48a180a260b6680ed72b4fad458119 Mon Sep 17 00:00:00 2001
From: Daniel Smith
Date: Wed, 12 May 2021 11:54:55 -0400
Subject: [PATCH 385/461] Set connectionFile instead of creating a shadow

---
 fdbserver/fdbserver.actor.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fdbserver/fdbserver.actor.cpp b/fdbserver/fdbserver.actor.cpp
index 712186affe..403c2ef48d 100644
--- a/fdbserver/fdbserver.actor.cpp
+++ b/fdbserver/fdbserver.actor.cpp
@@ -1505,7 +1505,7 @@ private:
 fprintf(stderr, "%s\n", ClusterConnectionString::getErrorString(connectionString, e).c_str());
 throw;
 }
- auto connectionFile = makeReference(connFile, ccs);
+ connectionFile = makeReference(connFile, ccs);
 } else {
 std::pair resolvedClusterFile;
 try {

From 78ef6822f6a26c366088f2362433c665e33b2754 Mon Sep 17 00:00:00 2001
From: Sreenath Bodagala
Date: Wed, 12 May 2021 16:40:33 +0000
Subject: [PATCH 386/461] Capture how fast an SS is catching up to its tLog-SS lag

Changes:

storageserver.actor.cpp:
- Report "fetchedVersions" and "duration" as part of StorageMetrics trace event.
- Report "sourceTLogID" as a separate trace event (and report this only when it changes).
--- fdbserver/storageserver.actor.cpp | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp index 728dae05fc..8cd4680f6d 100644 --- a/fdbserver/storageserver.actor.cpp +++ b/fdbserver/storageserver.actor.cpp @@ -545,9 +545,10 @@ public: int64_t versionLag; // An estimate for how many versions it takes for the data to move from the logs to this storage // server - int64_t versionCount; - double duration; - Optional sourceTLogID; + // Metrics about the latest batch of versions fetched by this StorageServer + int64_t fetchedVersions; // how many versions were fetched + double duration; // how long (in seconds) it took to fetch the versions + Optional sourceTLogID; // the tLog from which the versions were fetched ProtocolVersion logProtocol; @@ -710,6 +711,8 @@ public: specialCounter(cc, "DurableVersion", [self]() { return self->durableVersion.get(); }); specialCounter(cc, "DesiredOldestVersion", [self]() { return self->desiredOldestVersion.get(); }); specialCounter(cc, "VersionLag", [self]() { return self->versionLag; }); + specialCounter(cc, "FetchedVersions", [self]() { return self->fetchedVersions; }); + specialCounter(cc, "Duration", [self]() { return self->duration; }); specialCounter(cc, "LocalRate", [self] { return self->currentRate() * 100; }); specialCounter(cc, "BytesReadSampleCount", [self]() { return self->metrics.bytesReadSample.queue.size(); }); @@ -736,7 +739,7 @@ public: : fetchKeysHistograms(), instanceID(deterministicRandom()->randomUniqueID().first()), storage(this, storage), db(db), actors(false), lastTLogVersion(0), lastVersionWithData(0), restoredVersion(0), rebootAfterDurableVersion(std::numeric_limits::max()), durableInProgress(Void()), versionLag(0), - versionCount(0), duration(0), primaryLocality(tagLocalityInvalid), updateEagerReads(0), shardChangeCounter(0), + fetchedVersions(0), duration(0.0), 
primaryLocality(tagLocalityInvalid), updateEagerReads(0), shardChangeCounter(0), fetchKeysParallelismLock(SERVER_KNOBS->FETCH_KEYS_PARALLELISM_BYTES), shuttingDown(false), debug_inApplyUpdate(false), debug_lastValidateTime(0), watchBytes(0), numWatches(0), logProtocol(0), counters(this), tag(invalidTag), maxQueryQueue(0), thisServerID(ssi.id()), @@ -3527,18 +3530,20 @@ ACTOR Future update(StorageServer* data, bool* pReceivedUpdate) { if (data->otherError.getFuture().isReady()) data->otherError.getFuture().get(); - auto curTime = now(); - data->versionCount = ver - data->version.get(); - data->duration = curTime - data->lastUpdate; - data->sourceTLogID = cursor->getCurrentPeekLocation(); + data->fetchedVersions = ver - data->version.get(); + data->duration = now() - data->lastUpdate; + Optional curSourceTLogID = cursor->getCurrentPeekLocation(); - TraceEvent("StorageServerCatchUpRate", data->thisServerID) - .detail("VersionCount", data->versionCount) - .detail("Duration", data->duration) - .detail("SourceTLogId", data->sourceTLogID.present() ? data->sourceTLogID.get().toString() : "unknown"); + if (curSourceTLogID != data->sourceTLogID) { + data->sourceTLogID = curSourceTLogID; + + TraceEvent("StorageServerSourceTLogID", data->thisServerID) + .detail("SourceTLogID", data->sourceTLogID.present() ? 
data->sourceTLogID.get().toString() : "unknown") + .trackLatest(data->thisServerID.toString() + "/StorageServerSourceTLogID"); + } data->noRecentUpdates.set(false); - data->lastUpdate = curTime; + data->lastUpdate = now(); data->version.set(ver); // Triggers replies to waiting gets for new version(s) setDataVersion(data->thisServerID, data->version.get()); From 061afda2ec108b9343a3306a5b07824ec41eb28f Mon Sep 17 00:00:00 2001 From: Andrew Noyes Date: Wed, 12 May 2021 09:41:26 -0700 Subject: [PATCH 387/461] Fix several package issues (#4801) * Make the structure of the server pkg match 6.2.28 * Fix OSX lib path * Fix install destinations in client Previously, backup_agent would map to fdbmonitor installation dir - which is incorrect in the sense that it disagrees with where the default foundationdb.conf expects to find backup_agent. Add a new backupagent installation dir and install there, matching foundationdb.conf's expectations. Also fix an issue where several of the versions of fdbbackup weren't being installed * Update packaging/osx/buildpkg.sh for cmake * Update README instructions for pkg file * Remove osx cpack config * Remove pm install destinations * Fix weird syntax * Remove cpack reference to PM * Address review comments --- README.md | 4 +- cmake/CPackConfig.cmake | 18 --------- cmake/FDBInstall.cmake | 24 +++++++++--- cmake/InstallLayout.cmake | 46 +++-------------------- fdbbackup/CMakeLists.txt | 4 +- packaging/osx/buildpkg.sh | 48 +++++++++++++++--------- packaging/osx/scripts-server/postinstall | 6 +-- 7 files changed, 61 insertions(+), 89 deletions(-) diff --git a/README.md b/README.md index cd28c798f0..9e0ddb78a5 100755 --- a/README.md +++ b/README.md @@ -157,11 +157,11 @@ The build under MacOS will work the same way as on Linux. To get boost and ninja cmake -G Ninja ``` -To generate a installable package, you can use cpack: +To generate a installable package, ```sh ninja -cpack -G productbuild +$SRCDIR/packaging/osx/buildpkg.sh . 
$SRCDIR ``` ### Windows diff --git a/cmake/CPackConfig.cmake b/cmake/CPackConfig.cmake index 08f90bc0c5..c67059ec65 100644 --- a/cmake/CPackConfig.cmake +++ b/cmake/CPackConfig.cmake @@ -9,24 +9,6 @@ elseif(CPACK_GENERATOR MATCHES "DEB") set(CPACK_COMPONENTS_ALL clients-deb server-deb clients-versioned server-versioned) set(CPACK_RESOURCE_FILE_README ${CMAKE_SOURCE_DIR}/README.md) set(CPACK_RESOURCE_FILE_LICENSE ${CMAKE_SOURCE_DIR}/LICENSE) -elseif(CPACK_GENERATOR MATCHES "productbuild") - set(CPACK_PACKAGING_INSTALL_PREFIX "/") - set(CPACK_COMPONENTS_ALL clients-pm server-pm) - set(CPACK_STRIP_FILES TRUE) - set(CPACK_PREFLIGHT_SERVER_SCRIPT ${CMAKE_SOURCE_DIR}/packaging/osx/scripts-server/preinstall) - set(CPACK_POSTFLIGHT_SERVER_SCRIPT ${CMAKE_SOURCE_DIR}/packaging/osx/scripts-server/postinstall) - set(CPACK_POSTFLIGHT_CLIENTS_SCRIPT ${CMAKE_SOURCE_DIR}/packaging/osx/scripts-server/preinstall) -# Commenting out this readme file until it works within packaging - set(CPACK_RESOURCE_FILE_README ${CMAKE_SOURCE_DIR}/packaging/osx/resources/conclusion.rtf) - set(CPACK_PRODUCTBUILD_RESOURCES_DIR ${CMAKE_SOURCE_DIR}/packaging/osx/resources) -# Changing the path of this file as CMAKE_BINARY_DIR does not seem to be defined - set(CPACK_RESOURCE_FILE_LICENSE ${CMAKE_BINARY_DIR}/License.txt) - if(NOT FDB_RELEASE) - set(prerelease_string "-PRERELEASE") - else() - set(prerelease_string "") - endif() - set(CPACK_PACKAGE_FILE_NAME "FoundationDB-${PROJECT_VERSION}${prerelease_string}") elseif(CPACK_GENERATOR MATCHES "TGZ") set(CPACK_STRIP_FILES TRUE) set(CPACK_COMPONENTS_ALL clients-tgz server-tgz) diff --git a/cmake/FDBInstall.cmake b/cmake/FDBInstall.cmake index 263291c433..2dd4be696f 100644 --- a/cmake/FDBInstall.cmake +++ b/cmake/FDBInstall.cmake @@ -214,7 +214,7 @@ endfunction() function(fdb_install) if(NOT WIN32 AND NOT OPEN_FOR_IDE) - set(one_value_options COMPONENT DESTINATION EXPORT DESTINATION_SUFFIX) + set(one_value_options COMPONENT DESTINATION EXPORT 
DESTINATION_SUFFIX RENAME) set(multi_value_options TARGETS FILES PROGRAMS DIRECTORY) cmake_parse_arguments(IN "${options}" "${one_value_options}" "${multi_value_options}" "${ARGN}") @@ -237,6 +237,9 @@ function(fdb_install) get_install_dest(${pkg} ${destination} install_path) string(TOLOWER "${pkg}" package) if(install_export) + if(IN_RENAME) + message(FATAL_ERROR "RENAME for EXPORT target not implemented") + endif() install( EXPORT "${IN_EXPORT}-${package}" DESTINATION "${install_path}${IN_DESTINATION_SUFFIX}" @@ -248,11 +251,20 @@ function(fdb_install) set(export_args EXPORT "${IN_EXPORT}-${package}") endif() if(NOT ${install_path} STREQUAL "") - install( - ${args} - ${export_args} - DESTINATION "${install_path}${IN_DESTINATION_SUFFIX}" - COMPONENT "${IN_COMPONENT}-${package}") + if(IN_RENAME) + install( + ${args} + ${export_args} + DESTINATION "${install_path}${IN_DESTINATION_SUFFIX}" + COMPONENT "${IN_COMPONENT}-${package}" + RENAME ${IN_RENAME}) + else() + install( + ${args} + ${export_args} + DESTINATION "${install_path}${IN_DESTINATION_SUFFIX}" + COMPONENT "${IN_COMPONENT}-${package}") + endif() endif() endif() endforeach() diff --git a/cmake/InstallLayout.cmake b/cmake/InstallLayout.cmake index a037b65df2..91d39d4125 100644 --- a/cmake/InstallLayout.cmake +++ b/cmake/InstallLayout.cmake @@ -46,10 +46,6 @@ function(install_symlink) TO "../${rel_path}bin/${IN_FILE_NAME}" DESTINATION "usr/lib64/${IN_LINK_NAME}" COMPONENTS "${IN_COMPONENT}-deb") - install_symlink_impl( - TO "../${rel_path}local/bin/${IN_FILE_NAME}" - DESTINATION "usr/lib64/${IN_LINK_NAME}" - COMPONENTS "${IN_COMPONENT}-pm") elseif("${IN_LINK_DIR}" MATCHES "bin") install_symlink_impl( TO "../${rel_path}bin/${IN_FILE_NAME}" @@ -61,10 +57,6 @@ function(install_symlink) COMPONENTS "${IN_COMPONENT}-el6" "${IN_COMPONENT}-el7" "${IN_COMPONENT}-deb") - install_symlink_impl( - TO "../${rel_path}/bin/${IN_FILE_NAME}" - DESTINATION "usr/local/bin/${IN_LINK_NAME}" - COMPONENTS "${IN_COMPONENT}-pm") 
elseif("${IN_LINK_DIR}" MATCHES "fdbmonitor") install_symlink_impl( TO "../../${rel_path}bin/${IN_FILE_NAME}" @@ -76,10 +68,6 @@ function(install_symlink) COMPONENTS "${IN_COMPONENT}-el6" "${IN_COMPONENT}-el7" "${IN_COMPONENT}-deb") - install_symlink_impl( - TO "../../${rel_path}/bin/${IN_FILE_NAME}" - DESTINATION "usr/local/lib/foundationdb/${IN_LINK_NAME}" - COMPONENTS "${IN_COMPONENT}-pm") else() message(FATAL_ERROR "Unknown LINK_DIR ${IN_LINK_DIR}") endif() @@ -103,8 +91,8 @@ function(symlink_files) endif() endfunction() -fdb_install_packages(TGZ DEB EL7 PM VERSIONED) -fdb_install_dirs(BIN SBIN LIB FDBMONITOR INCLUDE ETC LOG DATA) +fdb_install_packages(TGZ DEB EL7 VERSIONED) +fdb_install_dirs(BIN SBIN LIB FDBMONITOR INCLUDE ETC LOG DATA BACKUPAGENT) message(STATUS "FDB_INSTALL_DIRS -> ${FDB_INSTALL_DIRS}") install_destinations(TGZ @@ -112,6 +100,7 @@ install_destinations(TGZ SBIN sbin LIB lib FDBMONITOR sbin + BACKUPAGENT usr/lib/foundationdb INCLUDE include ETC etc/foundationdb LOG log/foundationdb @@ -122,19 +111,13 @@ install_destinations(DEB SBIN usr/sbin LIB usr/lib FDBMONITOR usr/lib/foundationdb + BACKUPAGENT usr/lib/foundationdb INCLUDE usr/include ETC etc/foundationdb LOG var/log/foundationdb DATA var/lib/foundationdb/data) copy_install_destinations(DEB EL7) install_destinations(EL7 LIB usr/lib64) -install_destinations(PM - BIN usr/local/bin - SBIN usr/local/sbin - LIB lib - FDBMONITOR usr/local/libexec - INCLUDE usr/local/include - ETC usr/local/etc/foundationdb) # This can be used for debugging in case above is behaving funky #print_install_destinations() @@ -142,7 +125,7 @@ install_destinations(PM set(generated_dir "${CMAKE_CURRENT_BINARY_DIR}/generated") if(APPLE) - set(CPACK_GENERATOR TGZ productbuild) + set(CPACK_GENERATOR TGZ) else() set(CPACK_GENERATOR RPM DEB TGZ) endif() @@ -212,19 +195,16 @@ set(CPACK_PACKAGE_CONTACT "The FoundationDB Community") set(CPACK_COMPONENT_SERVER-EL7_DEPENDS clients-el7) set(CPACK_COMPONENT_SERVER-DEB_DEPENDS 
clients-deb) set(CPACK_COMPONENT_SERVER-TGZ_DEPENDS clients-tgz) -set(CPACK_COMPONENT_SERVER-PM_DEPENDS clients-pm) set(CPACK_COMPONENT_SERVER-VERSIONED_DEPENDS clients-versioned) set(CPACK_COMPONENT_SERVER-EL7_DISPLAY_NAME "foundationdb-server") set(CPACK_COMPONENT_SERVER-DEB_DISPLAY_NAME "foundationdb-server") set(CPACK_COMPONENT_SERVER-TGZ_DISPLAY_NAME "foundationdb-server") -set(CPACK_COMPONENT_SERVER-PM_DISPLAY_NAME "foundationdb-server") set(CPACK_COMPONENT_SERVER-VERSIONED_DISPLAY_NAME "foundationdb-server-${PROJECT_VERSION}") set(CPACK_COMPONENT_CLIENTS-EL7_DISPLAY_NAME "foundationdb-clients") set(CPACK_COMPONENT_CLIENTS-DEB_DISPLAY_NAME "foundationdb-clients") set(CPACK_COMPONENT_CLIENTS-TGZ_DISPLAY_NAME "foundationdb-clients") -set(CPACK_COMPONENT_CLIENTS-PM_DISPLAY_NAME "foundationdb-clients") set(CPACK_COMPONENT_CLIENTS-VERSIONED_DISPLAY_NAME "foundationdb-clients-${PROJECT_VERSION}") @@ -382,19 +362,6 @@ set(CPACK_DEBIAN_SERVER-VERSIONED_PACKAGE_CONTROL_EXTRA ${CMAKE_BINARY_DIR}/packaging/multiversion/server/postinst ${CMAKE_BINARY_DIR}/packaging/multiversion/server/prerm) -################################################################################ -# MacOS configuration -################################################################################ - -if(APPLE) - install(PROGRAMS ${CMAKE_SOURCE_DIR}/packaging/osx/uninstall-FoundationDB.sh - DESTINATION "usr/local/foundationdb" - COMPONENT clients-pm) - install(FILES ${CMAKE_SOURCE_DIR}/packaging/osx/com.foundationdb.fdbmonitor.plist - DESTINATION "Library/LaunchDaemons" - COMPONENT server-pm) -endif() - ################################################################################ # Configuration for DEB ################################################################################ @@ -413,9 +380,6 @@ set(CLUSTER_DESCRIPTION1 ${description1} CACHE STRING "Cluster description") set(CLUSTER_DESCRIPTION2 ${description2} CACHE STRING "Cluster description") if(NOT WIN32) - install(FILES 
${CMAKE_SOURCE_DIR}/packaging/osx/foundationdb.conf.new - DESTINATION "usr/local/etc" - COMPONENT server-pm) fdb_install(FILES ${CMAKE_SOURCE_DIR}/packaging/foundationdb.conf DESTINATION etc COMPONENT server) diff --git a/fdbbackup/CMakeLists.txt b/fdbbackup/CMakeLists.txt index 48b1ad1aef..da2457b850 100644 --- a/fdbbackup/CMakeLists.txt +++ b/fdbbackup/CMakeLists.txt @@ -23,14 +23,14 @@ target_link_libraries(fdbdecode PRIVATE fdbclient) if(NOT OPEN_FOR_IDE) if(GENERATE_DEBUG_PACKAGES) fdb_install(TARGETS fdbbackup DESTINATION bin COMPONENT clients) - fdb_install(PROGRAMS $ DESTINATION fdbmonitor COMPONENT clients RENAME backup_agent/backup_agent) + fdb_install(PROGRAMS $ DESTINATION backupagent COMPONENT clients RENAME backup_agent/backup_agent) fdb_install(PROGRAMS $ DESTINATION bin COMPONENT clients RENAME fdbrestore) fdb_install(PROGRAMS $ DESTINATION bin COMPONENT clients RENAME dr_agent) fdb_install(PROGRAMS $ DESTINATION bin COMPONENT clients RENAME fdbdr) else() add_custom_target(prepare_fdbbackup_install ALL DEPENDS strip_only_fdbbackup) fdb_install(PROGRAMS ${CMAKE_BINARY_DIR}/packages/bin/fdbbackup DESTINATION bin COMPONENT clients) - fdb_install(PROGRAMS ${CMAKE_BINARY_DIR}/packages/bin/fdbbackup DESTINATION fdbmonitor COMPONENT clients RENAME backup_agent/backup_agent) + fdb_install(PROGRAMS ${CMAKE_BINARY_DIR}/packages/bin/fdbbackup DESTINATION backupagent COMPONENT clients RENAME backup_agent/backup_agent) fdb_install(PROGRAMS ${CMAKE_BINARY_DIR}/packages/bin/fdbbackup DESTINATION bin COMPONENT clients RENAME fdbrestore) fdb_install(PROGRAMS ${CMAKE_BINARY_DIR}/packages/bin/fdbbackup DESTINATION bin COMPONENT clients RENAME dr_agent) fdb_install(PROGRAMS ${CMAKE_BINARY_DIR}/packages/bin/fdbbackup DESTINATION bin COMPONENT clients RENAME fdbdr) diff --git a/packaging/osx/buildpkg.sh b/packaging/osx/buildpkg.sh index 60b441b191..d8b9f560a5 100755 --- a/packaging/osx/buildpkg.sh +++ b/packaging/osx/buildpkg.sh @@ -1,12 +1,26 @@ #!/bin/bash -set -e +set 
-Eeuo pipefail umask 0022 -PKGFILE=$1 -VERSION=$2 -RELEASE=$3 +if [ "$#" -ne 2 ] ; then + echo "Usage: $0 " + exit 1 +fi + +# BUILDDIR is the path to the cmake build directory +# SRCDIR is the path to the source directory +# +# e.g. If your current directory is the project root and the build directory is _build, then you want to do +# $ ./packaging/osx/buildpkg.sh _build . +# +BUILDDIR="$1" +SRCDIR="$2" + +VERSION="$(grep 'CMAKE_PROJECT_VERSION[^_]' "$BUILDDIR/CMakeCache.txt" | sed -e 's/[^=]*=//')" + +PKGFILE="$BUILDDIR/packages/FoundationDB-$VERSION.pkg" CLIENTSDIR=$( mktemp -d -t fdb-clients-pkg ) SERVERDIR=$( mktemp -d -t fdb-server-pkg ) @@ -23,20 +37,20 @@ mkdir -p -m 0755 $CLIENTSDIR/Library/Python/2.7/site-packages/fdb mkdir -p -m 0775 $CLIENTSDIR/usr/local/etc/foundationdb mkdir -p -m 0755 $CLIENTSDIR/usr/local/foundationdb/backup_agent -install -m 0755 bin/fdbcli $CLIENTSDIR/usr/local/bin -install -m 0644 bindings/c/foundationdb/fdb_c.h bindings/c/foundationdb/fdb_c_options.g.h fdbclient/vexillographer/fdb.options $CLIENTSDIR/usr/local/include/foundationdb -install -m 0755 lib/libfdb_c.dylib $CLIENTSDIR/usr/local/lib -install -m 0644 bindings/python/fdb/*.py $CLIENTSDIR/Library/Python/2.7/site-packages/fdb -install -m 0755 bin/fdbbackup $CLIENTSDIR/usr/local/foundationdb/backup_agent/backup_agent -install -m 0755 packaging/osx/uninstall-FoundationDB.sh $CLIENTSDIR/usr/local/foundationdb -dos2unix README.md $CLIENTSDIR/usr/local/foundationdb/README +install -m 0755 "$BUILDDIR"/bin/fdbcli $CLIENTSDIR/usr/local/bin +install -m 0644 "$SRCDIR"/bindings/c/foundationdb/fdb_c.h "$BUILDDIR"/bindings/c/foundationdb/fdb_c_options.g.h "$SRCDIR"/fdbclient/vexillographer/fdb.options $CLIENTSDIR/usr/local/include/foundationdb +install -m 0755 "$BUILDDIR"/lib/libfdb_c.dylib $CLIENTSDIR/usr/local/lib +install -m 0644 "$BUILDDIR"/bindings/python/fdb/*.py $CLIENTSDIR/Library/Python/2.7/site-packages/fdb +install -m 0755 "$BUILDDIR"/bin/fdbbackup 
$CLIENTSDIR/usr/local/foundationdb/backup_agent/backup_agent +install -m 0755 "$SRCDIR"/packaging/osx/uninstall-FoundationDB.sh $CLIENTSDIR/usr/local/foundationdb +dos2unix "$SRCDIR"/README.md $CLIENTSDIR/usr/local/foundationdb/README chmod 0644 $CLIENTSDIR/usr/local/foundationdb/README ln -s /usr/local/foundationdb/backup_agent/backup_agent $CLIENTSDIR/usr/local/bin/fdbbackup ln -s /usr/local/foundationdb/backup_agent/backup_agent $CLIENTSDIR/usr/local/bin/fdbrestore ln -s /usr/local/foundationdb/backup_agent/backup_agent $CLIENTSDIR/usr/local/bin/fdbdr ln -s /usr/local/foundationdb/backup_agent/backup_agent $CLIENTSDIR/usr/local/bin/dr_agent -pkgbuild --root $CLIENTSDIR --identifier FoundationDB-clients --version $VERSION.$RELEASE --scripts packaging/osx/scripts-clients FoundationDB-clients.pkg +pkgbuild --root $CLIENTSDIR --identifier FoundationDB-clients --version $VERSION --scripts "$SRCDIR"/packaging/osx/scripts-clients FoundationDB-clients.pkg rm -rf $CLIENTSDIR @@ -46,14 +60,14 @@ mkdir -p -m 0755 $SERVERDIR/Library/LaunchDaemons mkdir -p -m 0700 $SERVERDIR/usr/local/foundationdb/data mkdir -p -m 0700 $SERVERDIR/usr/local/foundationdb/logs -install -m 0664 packaging/osx/foundationdb.conf.new $SERVERDIR/usr/local/etc/foundationdb -install -m 0755 bin/fdbserver bin/fdbmonitor $SERVERDIR/usr/local/libexec -install -m 0644 packaging/osx/com.foundationdb.fdbmonitor.plist $SERVERDIR/Library/LaunchDaemons +install -m 0664 "$SRCDIR"/packaging/osx/foundationdb.conf.new $SERVERDIR/usr/local/etc/foundationdb +install -m 0755 "$BUILDDIR"/bin/fdbserver "$BUILDDIR"/bin/fdbmonitor $SERVERDIR/usr/local/libexec +install -m 0644 "$SRCDIR"/packaging/osx/com.foundationdb.fdbmonitor.plist $SERVERDIR/Library/LaunchDaemons -pkgbuild --root $SERVERDIR --identifier FoundationDB-server --version $VERSION.$RELEASE --scripts packaging/osx/scripts-server FoundationDB-server.pkg +pkgbuild --root $SERVERDIR --identifier FoundationDB-server --version "$VERSION" --scripts 
"$SRCDIR"/packaging/osx/scripts-server FoundationDB-server.pkg rm -rf $SERVERDIR -productbuild --distribution packaging/osx/Distribution.xml --resources packaging/osx/resources --package-path . $PKGFILE +productbuild --distribution "$SRCDIR"/packaging/osx/Distribution.xml --resources "$SRCDIR"/packaging/osx/resources --package-path . "$PKGFILE" rm FoundationDB-clients.pkg FoundationDB-server.pkg diff --git a/packaging/osx/scripts-server/postinstall b/packaging/osx/scripts-server/postinstall index a31c3fd416..34ce9f7dad 100755 --- a/packaging/osx/scripts-server/postinstall +++ b/packaging/osx/scripts-server/postinstall @@ -1,10 +1,10 @@ #!/bin/bash if [ ! -f /usr/local/etc/foundationdb/fdb.cluster ]; then - description=`LC_CTYPE=C tr -dc A-Za-z0-9 < /dev/urandom | head -c 8` - random_str=`LC_CTYPE=C tr -dc A-Za-z0-9 < /dev/urandom | head -c 8` + description=`LC_CTYPE=C tr -dc '[:lower:][:upper:][:digit:]' < /dev/urandom | head -c 8` + random_str=`LC_CTYPE=C tr -dc '[:lower:][:upper:][:digit:]' < /dev/urandom | head -c 8` echo $description:$random_str@127.0.0.1:4689 > /usr/local/etc/foundationdb/fdb.cluster - chmod 0664 /etc/foundationdb/fdb.cluster + chmod 0664 /usr/local/etc/foundationdb/fdb.cluster NEWDB=1 fi From cc6497ddfb1a023171113b8286c3959bcd176faf Mon Sep 17 00:00:00 2001 From: RenxuanW Date: Wed, 12 May 2021 16:21:44 -0700 Subject: [PATCH 388/461] Only log timeout when CC is unknown for a worker. 
--- fdbserver/Knobs.cpp | 2 +- fdbserver/Knobs.h | 2 +- fdbserver/worker.actor.cpp | 16 ++++++++++------ 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/fdbserver/Knobs.cpp b/fdbserver/Knobs.cpp index aee36e39f1..fc1234d243 100644 --- a/fdbserver/Knobs.cpp +++ b/fdbserver/Knobs.cpp @@ -616,7 +616,7 @@ void ServerKnobs::initialize(bool randomize, ClientKnobs* clientKnobs, bool isSi //Worker init( WORKER_LOGGING_INTERVAL, 5.0 ); init( HEAP_PROFILER_INTERVAL, 30.0 ); - init( REGISTER_WORKER_REQUEST_TIMEOUT, 300.0 ); + init( UNKNOWN_CC_TIMEOUT, 600.0 ); init( DEGRADED_RESET_INTERVAL, 24*60*60 ); if ( randomize && BUGGIFY ) DEGRADED_RESET_INTERVAL = 10; init( DEGRADED_WARNING_LIMIT, 1 ); init( DEGRADED_WARNING_RESET_DELAY, 7*24*60*60 ); diff --git a/fdbserver/Knobs.h b/fdbserver/Knobs.h index 1c7c273a7b..be2caba6a1 100644 --- a/fdbserver/Knobs.h +++ b/fdbserver/Knobs.h @@ -543,7 +543,7 @@ public: // Worker double WORKER_LOGGING_INTERVAL; double HEAP_PROFILER_INTERVAL; - double REGISTER_WORKER_REQUEST_TIMEOUT; + double UNKNOWN_CC_TIMEOUT; double DEGRADED_RESET_INTERVAL; double DEGRADED_WARNING_LIMIT; double DEGRADED_WARNING_RESET_DELAY; diff --git a/fdbserver/worker.actor.cpp b/fdbserver/worker.actor.cpp index 5a568fc96d..5721b154d4 100644 --- a/fdbserver/worker.actor.cpp +++ b/fdbserver/worker.actor.cpp @@ -554,20 +554,24 @@ ACTOR Future registrationClient(Referenceget().present(); state Future registrationReply = - ccInterface->get().present() - ? brokenPromiseToNever(ccInterface->get().get().registerWorker.getReply(request)) - : Never(); + ccInterfacePresent ? 
brokenPromiseToNever(ccInterface->get().get().registerWorker.getReply(request)) + : Never(); state double startTime = now(); loop choose { when(RegisterWorkerReply reply = wait(registrationReply)) { processClass = reply.processClass; asyncPriorityInfo->set(reply.priorityInfo); - TraceEvent("WorkerRegisterReply").detail("CCID", ccInterface->get().get().id()); + TraceEvent("WorkerRegisterReply") + .detail("CCID", ccInterface->get().get().id()) + .detail("ProcessClass", reply.processClass.toString()); break; } - when(wait(delay(SERVER_KNOBS->REGISTER_WORKER_REQUEST_TIMEOUT))) { - TraceEvent(SevWarn, "WorkerRegisterTimeout").detail("WaitTime", now() - startTime); + when(wait(delay(SERVER_KNOBS->UNKNOWN_CC_TIMEOUT))) { + if (!ccInterfacePresent) { + TraceEvent(SevWarn, "WorkerRegisterTimeout").detail("WaitTime", now() - startTime); + } } when(wait(ccInterface->onChange())) { break; } when(wait(ddInterf->onChange())) { break; } From 160293bd5404e9ff2b7003ffe4f66e6b698e2b4b Mon Sep 17 00:00:00 2001 From: Sreenath Bodagala Date: Thu, 13 May 2021 14:28:06 +0000 Subject: [PATCH 389/461] Report bounce impact in fdbcli status Changes: Schemas.cpp: Extend the JSON schema to report whether the cluster is bounceable and if not, report the reason for why it is not bounceable. Status.actor.cpp: Extend recoveryStateStatusFetcher() to populate the bounce related field(s). mr-status-json-schemas.rst.inc: Update the schema to reflect the change made in Schemas.cpp. release-notes-700.rst: Add a note about the new status fields in "Status" section. 
--- .../source/mr-status-json-schemas.rst.inc | 5 +++- .../release-notes/release-notes-700.rst | 2 +- fdbclient/Schemas.cpp | 5 +++- fdbserver/Status.actor.cpp | 23 +++++++++++++------ 4 files changed, 25 insertions(+), 10 deletions(-) diff --git a/documentation/sphinx/source/mr-status-json-schemas.rst.inc b/documentation/sphinx/source/mr-status-json-schemas.rst.inc index acce461308..974244680d 100644 --- a/documentation/sphinx/source/mr-status-json-schemas.rst.inc +++ b/documentation/sphinx/source/mr-status-json-schemas.rst.inc @@ -477,7 +477,6 @@ ], "recovery_state":{ "seconds_since_last_recovered":1, - "seconds_since_fully_recovered":1, "required_resolvers":1, "required_commit_proxies":1, "required_grv_proxies":1, @@ -503,6 +502,10 @@ "required_logs":3, "missing_logs":"7f8d623d0cb9966e", "active_generations":1, + "bounce_impact":{ + "can_clean_bounce":true, + "reason":"" + }, "description":"Recovery complete." }, "workload":{ diff --git a/documentation/sphinx/source/release-notes/release-notes-700.rst b/documentation/sphinx/source/release-notes/release-notes-700.rst index 8e825035a9..c046690b2b 100644 --- a/documentation/sphinx/source/release-notes/release-notes-700.rst +++ b/documentation/sphinx/source/release-notes/release-notes-700.rst @@ -31,7 +31,7 @@ Fixes Status ------ * Added ``commit_batching_window_size`` to the proxy roles section of status to record statistics about commit batching window size on each proxy. `(PR #4735) `_ -* Added ``seconds_since_fully_recovered`` to the recovery_state section of status to report the time, in seconds, since last full recovery. `(PR #4770) `_ +* Added ``bounce_impact`` to the recovery_state section of status to report if the cluster is bounceable and if not, the reason for why it is not bounceable. 
`(PR #4770) `_ Bindings -------- diff --git a/fdbclient/Schemas.cpp b/fdbclient/Schemas.cpp index 1b24af0e77..22f0543a7e 100644 --- a/fdbclient/Schemas.cpp +++ b/fdbclient/Schemas.cpp @@ -521,7 +521,6 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema( R"statusSchema( "recovery_state":{ "seconds_since_last_recovered":1, - "seconds_since_fully_recovered":1, "required_resolvers":1, "required_commit_proxies":1, "required_grv_proxies":1, @@ -547,6 +546,10 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema( "required_logs":3, "missing_logs":"7f8d623d0cb9966e", "active_generations":1, + "bounce_impact":{ + "can_clean_bounce":true, + "reason":"" + }, "description":"Recovery complete." }, "workload":{ diff --git a/fdbserver/Status.actor.cpp b/fdbserver/Status.actor.cpp index 992338310c..eef9dc7d59 100644 --- a/fdbserver/Status.actor.cpp +++ b/fdbserver/Status.actor.cpp @@ -387,6 +387,19 @@ JsonBuilderObject getLagObject(int64_t versions) { return lag; } +static JsonBuilderObject getBounceImpactInfo(int recoveryStatusCode) { + JsonBuilderObject bounceImpact; + + if (recoveryStatusCode == RecoveryStatus::fully_recovered) { + bounceImpact["can_clean_bounce"] = true; + } else { + bounceImpact["can_clean_bounce"] = false; + bounceImpact["reason"] = "cluster hasn't fully recovered yet"; + } + + return bounceImpact; +} + struct MachineMemoryInfo { double memoryUsage; double aggregateLimit; @@ -1168,14 +1181,10 @@ ACTOR static Future recoveryStateStatusFetcher(Database cx, message["required_resolvers"] = requiredResolvers; } else if (mStatusCode == RecoveryStatus::locking_old_transaction_servers) { message["missing_logs"] = md.getValue("MissingIDs").c_str(); - } else if (mStatusCode == RecoveryStatus::fully_recovered) { - if (!rv.isError()) { - int64_t fullyRecoveredAtVersion = md.getInt64("FullyRecoveredAtVersion"); - double secondsSinceFulyRecovered = std::max((int64_t)0, (int64_t)(rv.get() - fullyRecoveredAtVersion)) / - 
(double)SERVER_KNOBS->VERSIONS_PER_SECOND; - message["seconds_since_fully_recovered"] = secondsSinceFulyRecovered; - } } + + message["bounce_impact"] = getBounceImpactInfo(mStatusCode); + // TODO: time_in_recovery: 0.5 // time_in_state: 0.1 From d8cad8efcae1d721ce602563509d81a051ad55c8 Mon Sep 17 00:00:00 2001 From: Sreenath Bodagala Date: Thu, 13 May 2021 16:36:57 +0000 Subject: [PATCH 390/461] Report bounce impact info as part of cluster JSON object. --- fdbclient/Schemas.cpp | 8 ++++---- fdbserver/Status.actor.cpp | 3 +-- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/fdbclient/Schemas.cpp b/fdbclient/Schemas.cpp index 22f0543a7e..0ba2feaaaa 100644 --- a/fdbclient/Schemas.cpp +++ b/fdbclient/Schemas.cpp @@ -546,10 +546,6 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema( "required_logs":3, "missing_logs":"7f8d623d0cb9966e", "active_generations":1, - "bounce_impact":{ - "can_clean_bounce":true, - "reason":"" - }, "description":"Recovery complete." 
}, "workload":{ @@ -652,6 +648,10 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema( "data_distribution_disabled_for_rebalance":true, "data_distribution_disabled":true, "active_primary_dc":"pv", + "bounce_impact":{ + "can_clean_bounce":true, + "reason":"" + }, "configuration":{ "log_anti_quorum":0, "log_replicas":2, diff --git a/fdbserver/Status.actor.cpp b/fdbserver/Status.actor.cpp index eef9dc7d59..12235f9d31 100644 --- a/fdbserver/Status.actor.cpp +++ b/fdbserver/Status.actor.cpp @@ -1183,8 +1183,6 @@ ACTOR static Future recoveryStateStatusFetcher(Database cx, message["missing_logs"] = md.getValue("MissingIDs").c_str(); } - message["bounce_impact"] = getBounceImpactInfo(mStatusCode); - // TODO: time_in_recovery: 0.5 // time_in_state: 0.1 @@ -2791,6 +2789,7 @@ ACTOR Future clusterGetStatus( statusObj["protocol_version"] = format("%" PRIx64, g_network->protocolVersion().version()); statusObj["connection_string"] = coordinators.ccf->getConnectionString().toString(); + statusObj["bounce_impact"] = getBounceImpactInfo(statusCode); state Optional configuration; state Optional loadResult; From 99f6032239aecb5402ef974e076f32de15d43e13 Mon Sep 17 00:00:00 2001 From: Sreenath Bodagala Date: Thu, 13 May 2021 16:47:05 +0000 Subject: [PATCH 391/461] Report bounce impact info as part of cluster JSON object. --- .../sphinx/source/mr-status-json-schemas.rst.inc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/documentation/sphinx/source/mr-status-json-schemas.rst.inc b/documentation/sphinx/source/mr-status-json-schemas.rst.inc index 974244680d..202496620d 100644 --- a/documentation/sphinx/source/mr-status-json-schemas.rst.inc +++ b/documentation/sphinx/source/mr-status-json-schemas.rst.inc @@ -502,10 +502,6 @@ "required_logs":3, "missing_logs":"7f8d623d0cb9966e", "active_generations":1, - "bounce_impact":{ - "can_clean_bounce":true, - "reason":"" - }, "description":"Recovery complete." 
}, "workload":{ @@ -608,6 +604,10 @@ "data_distribution_disabled_for_rebalance":true, "data_distribution_disabled":true, "active_primary_dc":"pv", + "bounce_impact":{ + "can_clean_bounce":true, + "reason":"" + }, "configuration":{ "log_anti_quorum":0, "log_replicas":2, From 8a15d7d14bb198e7f722858607cf2f38dc51ba11 Mon Sep 17 00:00:00 2001 From: RenxuanW Date: Thu, 13 May 2021 12:20:31 -0700 Subject: [PATCH 392/461] Bring #4518 (Logging more detailed information during Tlog recruitment) back. --- fdbserver/ClusterController.actor.cpp | 110 ++++++++++++++++++++++---- 1 file changed, 94 insertions(+), 16 deletions(-) diff --git a/fdbserver/ClusterController.actor.cpp b/fdbserver/ClusterController.actor.cpp index b43e0de27d..107c865221 100644 --- a/fdbserver/ClusterController.actor.cpp +++ b/fdbserver/ClusterController.actor.cpp @@ -458,6 +458,33 @@ public: } } + // Log the reason why the worker is considered as unavailable. + void logWorkerUnavailable(const UID& id, + const std::string& method, + const std::string& reason, + const WorkerDetails& details, + const ProcessClass::Fitness& fitness, + const std::set>& dcIds) { + // Construct the list of DCs where the TLog recruitment is happening. This is mainly for logging purpose. + std::string dcList; + for (const auto& dc : dcIds) { + if (!dcList.empty()) { + dcList += ','; + } + dcList += printable(dc); + } + // Note that the recruitment happens only during initial database creation and recovery. So these trace + // events should be sparse. 
+ TraceEvent("GetTLogTeamWorkerUnavailable", id) + .detail("TLogRecruitMethod", method) + .detail("Reason", reason) + .detail("WorkerID", details.interf.id()) + .detail("WorkerDC", details.interf.locality.dcId()) + .detail("Address", details.interf.addresses().toString()) + .detail("Fitness", fitness) + .detail("RecruitmentDcIds", dcList); + }; + // A TLog recruitment method specialized for three_data_hall and three_datacenter configurations // It attempts to evenly recruit processes from across data_halls or datacenters std::vector getWorkersForTlogsComplex(DatabaseConfiguration const& conf, @@ -478,11 +505,30 @@ public: auto fitness = worker_details.processClass.machineClassFitness(ProcessClass::TLog); if (std::find(exclusionWorkerIds.begin(), exclusionWorkerIds.end(), worker_details.interf.id()) != - exclusionWorkerIds.end() || - !workerAvailable(worker_info, checkStable) || - conf.isExcludedServer(worker_details.interf.addresses()) || fitness == ProcessClass::NeverAssign || - (!dcIds.empty() && dcIds.count(worker_details.interf.locality.dcId()) == 0) || - (!allowDegraded && worker_details.degraded)) { + exclusionWorkerIds.end()) { + logWorkerUnavailable(id, "complex", "Worker is excluded", worker_details, fitness, dcIds); + continue; + } + if (!workerAvailable(worker_info, checkStable)) { + logWorkerUnavailable(id, "complex", "Worker is not available", worker_details, fitness, dcIds); + continue; + } + if (conf.isExcludedServer(worker_details.interf.addresses())) { + logWorkerUnavailable( + id, "complex", "Worker server is excluded from the cluster", worker_details, fitness, dcIds); + continue; + } + if (fitness == ProcessClass::NeverAssign) { + logWorkerUnavailable(id, "complex", "Worker's fitness is NeverAssign", worker_details, fitness, dcIds); + continue; + } + if (!dcIds.empty() && dcIds.count(worker_details.interf.locality.dcId()) == 0) { + logWorkerUnavailable(id, "complex", "Worker is not in the target DC", worker_details, fitness, dcIds); + continue; + 
} + if (!allowDegraded && worker_details.degraded) { + logWorkerUnavailable( + id, "complex", "Worker is degraded and not allowed", worker_details, fitness, dcIds); continue; } @@ -686,10 +732,25 @@ public: const auto& worker_details = worker_info.details; auto fitness = worker_details.processClass.machineClassFitness(ProcessClass::TLog); if (std::find(exclusionWorkerIds.begin(), exclusionWorkerIds.end(), worker_details.interf.id()) != - exclusionWorkerIds.end() || - !workerAvailable(worker_info, checkStable) || - conf.isExcludedServer(worker_details.interf.addresses()) || fitness == ProcessClass::NeverAssign || - (!dcIds.empty() && dcIds.count(worker_details.interf.locality.dcId()) == 0)) { + exclusionWorkerIds.end()) { + logWorkerUnavailable(id, "simple", "Worker is excluded", worker_details, fitness, dcIds); + continue; + } + if (!workerAvailable(worker_info, checkStable)) { + logWorkerUnavailable(id, "simple", "Worker is not available", worker_details, fitness, dcIds); + continue; + } + if (conf.isExcludedServer(worker_details.interf.addresses())) { + logWorkerUnavailable( + id, "simple", "Worker server is excluded from the cluster", worker_details, fitness, dcIds); + continue; + } + if (fitness == ProcessClass::NeverAssign) { + logWorkerUnavailable(id, "simple", "Worker's fitness is NeverAssign", worker_details, fitness, dcIds); + continue; + } + if (!dcIds.empty() && dcIds.count(worker_details.interf.locality.dcId()) == 0) { + logWorkerUnavailable(id, "simple", "Worker is not in the target DC", worker_details, fitness, dcIds); continue; } @@ -795,10 +856,27 @@ public: const auto& worker_details = worker_info.details; auto fitness = worker_details.processClass.machineClassFitness(ProcessClass::TLog); if (std::find(exclusionWorkerIds.begin(), exclusionWorkerIds.end(), worker_details.interf.id()) != - exclusionWorkerIds.end() || - !workerAvailable(worker_info, checkStable) || - conf.isExcludedServer(worker_details.interf.addresses()) || fitness == 
ProcessClass::NeverAssign || - (!dcIds.empty() && dcIds.count(worker_details.interf.locality.dcId()) == 0)) { + exclusionWorkerIds.end()) { + logWorkerUnavailable(id, "deprecated", "Worker is excluded", worker_details, fitness, dcIds); + continue; + } + if (!workerAvailable(worker_info, checkStable)) { + logWorkerUnavailable(id, "deprecated", "Worker is not available", worker_details, fitness, dcIds); + continue; + } + if (conf.isExcludedServer(worker_details.interf.addresses())) { + logWorkerUnavailable( + id, "deprecated", "Worker server is excluded from the cluster", worker_details, fitness, dcIds); + continue; + } + if (fitness == ProcessClass::NeverAssign) { + logWorkerUnavailable( + id, "deprecated", "Worker's fitness is NeverAssign", worker_details, fitness, dcIds); + continue; + } + if (!dcIds.empty() && dcIds.count(worker_details.interf.locality.dcId()) == 0) { + logWorkerUnavailable( + id, "deprecated", "Worker is not in the target DC", worker_details, fitness, dcIds); continue; } @@ -3091,9 +3169,9 @@ ACTOR Future workerAvailabilityWatch(WorkerInterface worker, cluster->masterProcessId = Optional(); } TraceEvent("ClusterControllerWorkerFailed", cluster->id) - .detail("ProcessId", worker.locality.processId()) - .detail("ProcessClass", failedWorkerInfo.details.processClass.toString()) - .detail("Address", worker.address()); + .detail("ProcessId", worker.locality.processId()) + .detail("ProcessClass", failedWorkerInfo.details.processClass.toString()) + .detail("Address", worker.address()); cluster->removedDBInfoEndpoints.insert(worker.updateServerDBInfo.getEndpoint()); cluster->id_worker.erase(worker.locality.processId()); cluster->updateWorkerList.set(worker.locality.processId(), Optional()); From bdb5517f1e281c6ceabe9d0dd61a10ebfa701465 Mon Sep 17 00:00:00 2001 From: Sreenath Bodagala Date: Thu, 13 May 2021 19:38:05 +0000 Subject: [PATCH 393/461] Provide a better explanation of the new metrics in the release notes. 
--- documentation/sphinx/source/release-notes/release-notes-700.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/documentation/sphinx/source/release-notes/release-notes-700.rst b/documentation/sphinx/source/release-notes/release-notes-700.rst index c046690b2b..8997f8c9fd 100644 --- a/documentation/sphinx/source/release-notes/release-notes-700.rst +++ b/documentation/sphinx/source/release-notes/release-notes-700.rst @@ -31,7 +31,7 @@ Fixes Status ------ * Added ``commit_batching_window_size`` to the proxy roles section of status to record statistics about commit batching window size on each proxy. `(PR #4735) `_ -* Added ``bounce_impact`` to the recovery_state section of status to report if the cluster is bounceable and if not, the reason for why it is not bounceable. `(PR #4770) `_ +* Added ``cluster.bounce_impact`` section to status to report if there will be any extra effects when bouncing the cluster, and if so, the reason for those effects. `(PR #4770) `_ Bindings -------- From 6275adc5a063d76238b00ead7fe8baeac7b35aab Mon Sep 17 00:00:00 2001 From: Sreenath Bodagala Date: Thu, 13 May 2021 21:38:07 +0000 Subject: [PATCH 394/461] Address build failure LogSystemPeekCursor.actor.cpp: Check if "interf" is set before referencing it. 
--- fdbserver/LogSystemPeekCursor.actor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbserver/LogSystemPeekCursor.actor.cpp b/fdbserver/LogSystemPeekCursor.actor.cpp index 09e692e0b6..26287919cd 100644 --- a/fdbserver/LogSystemPeekCursor.actor.cpp +++ b/fdbserver/LogSystemPeekCursor.actor.cpp @@ -393,7 +393,7 @@ Version ILogSystem::ServerPeekCursor::getMinKnownCommittedVersion() const { } Optional ILogSystem::ServerPeekCursor::getPrimaryPeekLocation() const { - if (interf->get().present()) { + if (interf && interf->get().present()) { return interf->get().id(); } return Optional(); From 4163270c02b763b739edacbd4504815c02f85ac1 Mon Sep 17 00:00:00 2001 From: Andrew Noyes Date: Thu, 13 May 2021 23:13:14 +0000 Subject: [PATCH 395/461] Put aarch64 libfdb_java in the right place for fat jar --- bindings/java/CMakeLists.txt | 8 +++--- .../main/com/apple/foundationdb/JNIUtil.java | 26 ++++++++++--------- 2 files changed, 19 insertions(+), 15 deletions(-) diff --git a/bindings/java/CMakeLists.txt b/bindings/java/CMakeLists.txt index 2da8639b8d..09012cdf97 100644 --- a/bindings/java/CMakeLists.txt +++ b/bindings/java/CMakeLists.txt @@ -141,8 +141,6 @@ endif() target_include_directories(fdb_java PRIVATE ${JNI_INCLUDE_DIRS}) # libfdb_java.so is loaded by fdb-java.jar and doesn't need to depened on jvm shared libraries. 
target_link_libraries(fdb_java PRIVATE fdb_c) -set_target_properties(fdb_java PROPERTIES - LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lib/${SYSTEM_NAME}/amd64/) if(APPLE) set_target_properties(fdb_java PROPERTIES SUFFIX ".jnilib") endif() @@ -217,7 +215,11 @@ if(NOT OPEN_FOR_IDE) elseif(APPLE) set(lib_destination "osx/x86_64") else() - set(lib_destination "linux/amd64") + if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") + set(lib_destination "linux/aarch64") + else() + set(lib_destination "linux/amd64") + endif() endif() set(lib_destination "${unpack_dir}/lib/${lib_destination}") set(jni_package "${CMAKE_BINARY_DIR}/packages/lib") diff --git a/bindings/java/src/main/com/apple/foundationdb/JNIUtil.java b/bindings/java/src/main/com/apple/foundationdb/JNIUtil.java index 8aa3d9f138..a5380112cd 100644 --- a/bindings/java/src/main/com/apple/foundationdb/JNIUtil.java +++ b/bindings/java/src/main/com/apple/foundationdb/JNIUtil.java @@ -36,11 +36,7 @@ class JNIUtil { private static final String TEMPFILE_PREFIX = "fdbjni"; private static final String TEMPFILE_SUFFIX = ".library"; - private enum OS { - WIN32("windows", "amd64", false), - LINUX("linux", "amd64", true), - OSX("osx", "x86_64", true); - + private static class OS { private final String name; private final String arch; private final boolean canDeleteEager; @@ -171,13 +167,19 @@ class JNIUtil { private static OS getRunningOS() { String osname = System.getProperty("os.name").toLowerCase(); - if(osname.startsWith("windows")) - return OS.WIN32; - if(osname.startsWith("linux")) - return OS.LINUX; - if(osname.startsWith("mac") || osname.startsWith("darwin")) - return OS.OSX; - throw new IllegalStateException("Unknown or unsupported OS: " + osname); + String arch = System.getProperty("os.arch"); + if (arch != "amd64" && arch != "x86_64" && arch != "aarch64") { + throw new IllegalStateException("Unknown or unsupported arch: " + arch); + } + if (osname.startsWith("windows")) { + return new OS("windows", arch, /* 
canDeleteEager */ false); + } else if (osname.startsWith("linux")) { + return new OS("linux", arch, /* canDeleteEager */ true); + } else if (osname.startsWith("mac") || osname.startsWith("darwin")) { + return new OS("osx", arch, /* canDeleteEager */ true); + } else { + throw new IllegalStateException("Unknown or unsupported OS: " + osname); + } } private JNIUtil() {} From e892ca00e4f3e2f78a011d543f452691f2eb7ba7 Mon Sep 17 00:00:00 2001 From: Andrew Noyes Date: Fri, 14 May 2021 00:03:03 +0000 Subject: [PATCH 396/461] Use proper string equality --- bindings/java/src/main/com/apple/foundationdb/JNIUtil.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bindings/java/src/main/com/apple/foundationdb/JNIUtil.java b/bindings/java/src/main/com/apple/foundationdb/JNIUtil.java index a5380112cd..99c2f8a322 100644 --- a/bindings/java/src/main/com/apple/foundationdb/JNIUtil.java +++ b/bindings/java/src/main/com/apple/foundationdb/JNIUtil.java @@ -168,7 +168,7 @@ class JNIUtil { private static OS getRunningOS() { String osname = System.getProperty("os.name").toLowerCase(); String arch = System.getProperty("os.arch"); - if (arch != "amd64" && arch != "x86_64" && arch != "aarch64") { + if (!arch.equals("amd64") && !arch.equals("x86_64") && !arch.equals("aarch64")) { throw new IllegalStateException("Unknown or unsupported arch: " + arch); } if (osname.startsWith("windows")) { From 70e53605cfe83ae798ccf4027e27d36b0077ad67 Mon Sep 17 00:00:00 2001 From: Alex Moundalexis Date: Fri, 14 May 2021 10:50:46 -0400 Subject: [PATCH 397/461] updated copyright year on web site --- documentation/sphinx/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/documentation/sphinx/conf.py b/documentation/sphinx/conf.py index 5ec9238930..ab42fdba6a 100644 --- a/documentation/sphinx/conf.py +++ b/documentation/sphinx/conf.py @@ -49,7 +49,7 @@ master_doc = 'index' # General information about the project. 
project = u'FoundationDB' -copyright = u'2013-2018 Apple, Inc and the FoundationDB project authors' +copyright = u'2013-2021 Apple, Inc and the FoundationDB project authors' # Load the version information from 'versions.target' import xml.etree.ElementTree as ET From a162682d6d63a435da4e283c28c3762e40f184ad Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Fri, 14 May 2021 11:12:47 -0700 Subject: [PATCH 398/461] Fix accounting for time spent in run loop after breaking due to yield --- flow/Net2.actor.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/flow/Net2.actor.cpp b/flow/Net2.actor.cpp index bb0b0325c6..c3b35f1203 100644 --- a/flow/Net2.actor.cpp +++ b/flow/Net2.actor.cpp @@ -1513,6 +1513,7 @@ void Net2::run() { double newTaskBegin = timer_monotonic(); if (check_yield(TaskPriority::Max, tscNow)) { checkForSlowTask(tscBegin, tscNow, newTaskBegin - taskBegin, currentTaskID); + taskBegin = newTaskBegin; FDB_TRACE_PROBE(run_loop_yield); ++countYields; break; From d55b94fc06ca521bfc8b35d91a9f0f0ad226f5a8 Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Fri, 14 May 2021 12:38:26 -0700 Subject: [PATCH 399/461] Add release note --- documentation/sphinx/source/release-notes/release-notes-630.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/documentation/sphinx/source/release-notes/release-notes-630.rst b/documentation/sphinx/source/release-notes/release-notes-630.rst index f4b5c8aacb..2057e7fcb2 100644 --- a/documentation/sphinx/source/release-notes/release-notes-630.rst +++ b/documentation/sphinx/source/release-notes/release-notes-630.rst @@ -2,9 +2,11 @@ Release Notes ############# + 6.3.13 ====== * The multi-version client now requires at most two client connections with version 6.2 or larger, regardless of how many external clients are configured. Clients older than 6.2 will continue to create an additional connection each. `(PR #4667) `_ +* Fix an accounting error that could potentially result in inaccuracies in priority busyness metrics. 
`(PR #4824) `_ 6.3.12 ====== From 2298567c2bf998915123401e004680ee6726b099 Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Fri, 14 May 2021 23:12:00 -0700 Subject: [PATCH 400/461] Use of aligned_alloc() for 4k pages causes too much wasted virtual memory. Added new 4k-aligned fast allocator, and changed Arena::allocatedAlignedBuffer() to be 4k-specific, now called Arena::allocate4kAlignedBuffer(). --- fdbrpc/dsltest.actor.cpp | 4 ++-- fdbserver/IPager.h | 5 ++++- flow/Arena.cpp | 36 ++++++++++++++++++------------------ flow/Arena.h | 14 +++++++------- flow/FastAlloc.h | 20 ++++++++++++++++++++ 5 files changed, 51 insertions(+), 28 deletions(-) diff --git a/fdbrpc/dsltest.actor.cpp b/fdbrpc/dsltest.actor.cpp index 21e1808afd..89703ea2d2 100644 --- a/fdbrpc/dsltest.actor.cpp +++ b/fdbrpc/dsltest.actor.cpp @@ -632,8 +632,8 @@ void showArena(ArenaBlock* a, ArenaBlock* parent) { ArenaBlockRef* r = (ArenaBlockRef*)((char*)a->getData() + o); // If alignedBuffer is valid then print its pointer and size, else recurse - if (r->alignedBufferSize != 0) { - printf("AlignedBuffer %p (<-%p) %u bytes\n", r->alignedBuffer, a, r->alignedBufferSize); + if (r->aligned4kBufferSize != 0) { + printf("AlignedBuffer %p (<-%p) %u bytes\n", r->aligned4kBuffer, a, r->aligned4kBufferSize); } else { showArena(r->next, a); } diff --git a/fdbserver/IPager.h b/fdbserver/IPager.h index cb9612fd95..79d7ed2a80 100644 --- a/fdbserver/IPager.h +++ b/fdbserver/IPager.h @@ -43,7 +43,7 @@ public: // The page's logical size includes an opaque checksum, use size() to get usable size ArenaPage(int logicalSize, int bufferSize) : logicalSize(logicalSize), bufferSize(bufferSize), userData(nullptr) { if (bufferSize > 0) { - buffer = (uint8_t*)arena.allocateAlignedBuffer(4096, bufferSize); + buffer = (uint8_t*)arena.allocate4kAlignedBuffer(bufferSize); // Mark any unused page portion defined VALGRIND_MAKE_MEM_DEFINED(buffer + logicalSize, bufferSize - logicalSize); @@ -56,6 +56,9 @@ public: if (userData != 
nullptr && userDataDestructor != nullptr) { userDataDestructor(userData); } + if(buffer != 0) { + VALGRIND_MAKE_MEM_UNDEFINED(buffer, bufferSize); + } } uint8_t const* begin() const { return (uint8_t*)buffer; } diff --git a/flow/Arena.cpp b/flow/Arena.cpp index fe649c548b..016112cc5b 100644 --- a/flow/Arena.cpp +++ b/flow/Arena.cpp @@ -102,8 +102,8 @@ void Arena::dependsOn(const Arena& p) { } } -void* Arena::allocateAlignedBuffer(size_t alignment, size_t size) { - return ArenaBlock::dependOnAlignedBuffer(impl, alignment, size); +void* Arena::allocate4kAlignedBuffer(size_t size) { + return ArenaBlock::dependOn4kAlignedBuffer(impl, size); } size_t Arena::getSize() const { @@ -177,8 +177,8 @@ size_t ArenaBlock::totalSize() { while (o) { ArenaBlockRef* r = (ArenaBlockRef*)((char*)getData() + o); makeDefined(r, sizeof(ArenaBlockRef)); - if (r->alignedBufferSize != 0) { - s += r->alignedBufferSize; + if (r->aligned4kBufferSize != 0) { + s += r->aligned4kBufferSize; } else { allowAccess(r->next); s += r->next->totalSize(); @@ -201,7 +201,7 @@ void ArenaBlock::getUniqueBlocks(std::set& a) { makeDefined(r, sizeof(ArenaBlockRef)); // If next is valid recursively count its blocks - if (r->alignedBufferSize == 0) { + if (r->aligned4kBufferSize == 0) { r->next->getUniqueBlocks(a); } @@ -226,7 +226,7 @@ int ArenaBlock::addUsed(int bytes) { void ArenaBlock::makeReference(ArenaBlock* next) { ArenaBlockRef* r = (ArenaBlockRef*)((char*)getData() + bigUsed); makeDefined(r, sizeof(ArenaBlockRef)); - r->alignedBufferSize = 0; + r->aligned4kBufferSize = 0; r->next = next; r->nextBlockOffset = nextBlockOffset; makeNoAccess(r, sizeof(ArenaBlockRef)); @@ -234,17 +234,17 @@ void ArenaBlock::makeReference(ArenaBlock* next) { bigUsed += sizeof(ArenaBlockRef); } -void* ArenaBlock::makeAlignedBuffer(size_t alignment, size_t size) { +void* ArenaBlock::make4kAlignedBuffer(size_t size) { ArenaBlockRef* r = (ArenaBlockRef*)((char*)getData() + bigUsed); makeDefined(r, sizeof(ArenaBlockRef)); - 
r->alignedBufferSize = size; - r->alignedBuffer = aligned_alloc(alignment, size); - // printf("Arena::alignedBuffer alloc %p\n", r->alignedBuffer); + r->aligned4kBufferSize = size; + r->aligned4kBuffer = allocateFast4kAligned(size); + //printf("Arena::aligned4kBuffer alloc size=%u ptr=%p\n", size, r->aligned4kBuffer); r->nextBlockOffset = nextBlockOffset; makeNoAccess(r, sizeof(ArenaBlockRef)); nextBlockOffset = bigUsed; bigUsed += sizeof(ArenaBlockRef); - return r->alignedBuffer; + return r->aligned4kBuffer; } void ArenaBlock::dependOn(Reference& self, ArenaBlock* other) { @@ -255,11 +255,11 @@ void ArenaBlock::dependOn(Reference& self, ArenaBlock* other) { self->makeReference(other); } -void* ArenaBlock::dependOnAlignedBuffer(Reference& self, size_t alignment, size_t size) { +void* ArenaBlock::dependOn4kAlignedBuffer(Reference& self, size_t size) { if (!self || self->isTiny() || self->unused() < sizeof(ArenaBlockRef)) { - return create(SMALL, self)->makeAlignedBuffer(alignment, size); + return create(SMALL, self)->make4kAlignedBuffer(size); } else { - return self->makeAlignedBuffer(alignment, size); + return self->make4kAlignedBuffer(size); } } @@ -396,10 +396,10 @@ void ArenaBlock::destroy() { ArenaBlockRef* br = (ArenaBlockRef*)((char*)b->getData() + o); makeDefined(br, sizeof(ArenaBlockRef)); - // If alignedBuffer is valid, free it - if (br->alignedBufferSize != 0) { - // printf("Arena::alignedBuffer free %p\n", br->alignedBuffer); - aligned_free(br->alignedBuffer); + // If aligned4kBuffer is valid, free it + if (br->aligned4kBufferSize != 0) { + //printf("Arena::aligned4kBuffer free %p\n", br->aligned4kBuffer); + freeFast4kAligned(br->aligned4kBufferSize, br->aligned4kBuffer); } else { allowAccess(br->next); if (br->next->delref_no_destroy()) diff --git a/flow/Arena.h b/flow/Arena.h index 999e873044..a34dcf67c6 100644 --- a/flow/Arena.h +++ b/flow/Arena.h @@ -102,7 +102,7 @@ public: Arena& operator=(Arena&&) noexcept; void dependsOn(const Arena& p); - void* 
allocateAlignedBuffer(size_t alignment, size_t size); + void* allocate4kAlignedBuffer(size_t size); size_t getSize() const; bool hasFree(size_t size, const void* address); @@ -130,12 +130,12 @@ struct scalar_traits : std::true_type { }; struct ArenaBlockRef { - // If alignedBufferSize is not 0, alignedBuffer is valid and must be freed with aligned_free() - // Otherwise, next is valid - size_t alignedBufferSize; + // Only one of (next, aligned4kBuffer) are valid at any one time, as they occupy the same space. + // If aligned4kBufferSize is not 0, aligned4kBuffer is valid, otherwise next is valid. + size_t aligned4kBufferSize; union { ArenaBlock* next; - void* alignedBuffer; + void* aligned4kBuffer; }; uint32_t nextBlockOffset; }; @@ -167,9 +167,9 @@ struct ArenaBlock : NonCopyable, ThreadSafeReferenceCounted { void getUniqueBlocks(std::set& a); int addUsed(int bytes); void makeReference(ArenaBlock* next); - void* makeAlignedBuffer(size_t alignment, size_t size); + void* make4kAlignedBuffer(size_t size); static void dependOn(Reference& self, ArenaBlock* other); - static void* dependOnAlignedBuffer(Reference& self, size_t alignment, size_t size); + static void* dependOn4kAlignedBuffer(Reference& self, size_t size); static void* allocate(Reference& self, int bytes); // Return an appropriately-sized ArenaBlock to store the given data static ArenaBlock* create(int dataSize, Reference& next); diff --git a/flow/FastAlloc.h b/flow/FastAlloc.h index f9fff408d2..3f5f2ab58b 100644 --- a/flow/FastAlloc.h +++ b/flow/FastAlloc.h @@ -266,4 +266,24 @@ inline void freeFast(int size, void* ptr) { delete[](uint8_t*) ptr; } +[[nodiscard]] inline void* allocateFast4kAligned(int size) { + if (size <= 4096) + return FastAllocator<4096>::allocate(); + if (size <= 8192) + return FastAllocator<8192>::allocate(); + if (size <= 16384) + return FastAllocator<16384>::allocate(); + return aligned_alloc(4096, size); +} + +inline void freeFast4kAligned(int size, void* ptr) { + if (size <= 4096) + 
return FastAllocator<4096>::release(ptr); + if (size <= 8192) + return FastAllocator<8192>::release(ptr); + if (size <= 16384) + return FastAllocator<16384>::release(ptr); + aligned_free(ptr); +} + #endif From 6a5bf120f83c9efa73b91ebaa54ab6764e53bb41 Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Sat, 15 May 2021 23:00:21 -0700 Subject: [PATCH 401/461] Bug fix: It is possible for the pager to be shut down while a cursor operation is acquiring its mutex, specifically after the permit is available but before the delay(0) inside take() is ready, causing the cursor to operate on an invalid pager. --- fdbserver/VersionedBTree.actor.cpp | 9 +++++++++ flow/genericactors.actor.h | 8 ++++++++ 2 files changed, 17 insertions(+) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 1c97ab11bd..66e6bed9b7 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -264,6 +264,7 @@ public: Future writeOperations; FlowLock mutex; + Future killMutex; Cursor() : mode(NONE) {} @@ -274,6 +275,14 @@ public: int readOffset = 0, LogicalPageID endPage = invalidLogicalPageID) { queue = q; + + // If the pager gets an error, which includes shutdown, kill the mutex so any waiters can no longer run. + // This avoids having every mutex wait also wait on pagerError. 
+ killMutex = map(ready(queue->pagerError), [=](Void e) { + mutex.kill(); + return Void(); + }); + mode = m; firstPageIDWritten = invalidLogicalPageID; offset = readOffset; diff --git a/flow/genericactors.actor.h b/flow/genericactors.actor.h index 46cdb6d73f..7bf2a05e63 100644 --- a/flow/genericactors.actor.h +++ b/flow/genericactors.actor.h @@ -1334,6 +1334,14 @@ struct FlowLock : NonCopyable, public ReferenceCounted { int64_t activePermits() const { return active; } int waiters() const { return takers.size(); } + // Try to send error to all current and future waiters + // Only works if broken_on_destruct.canBeSet() + void kill(Error e = broken_promise()) { + if (broken_on_destruct.canBeSet()) { + broken_on_destruct.sendError(e); + } + } + private: std::list, int64_t>> takers; const int64_t permits; From cfeff9aa4bf8a2cc43845218e8bc33e00cc128ab Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Sun, 16 May 2021 01:41:40 -0700 Subject: [PATCH 402/461] Clarity improvement, loop was reusing variable name from enclosing scope. 
--- fdbserver/IPager.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fdbserver/IPager.h b/fdbserver/IPager.h index 79d7ed2a80..811ed8ab77 100644 --- a/fdbserver/IPager.h +++ b/fdbserver/IPager.h @@ -93,16 +93,16 @@ public: int usableSize = pages.front()->size(); int totalUsableSize = pages.size() * usableSize; int totalBufferSize = pages.front()->bufferSize * pages.size(); - ArenaPage* p = new ArenaPage(totalUsableSize + sizeof(Checksum), totalBufferSize); + ArenaPage* superpage = new ArenaPage(totalUsableSize + sizeof(Checksum), totalBufferSize); - uint8_t* wptr = p->mutate(); + uint8_t* wptr = superpage->mutate(); for (auto& p : pages) { ASSERT(p->size() == usableSize); memcpy(wptr, p->begin(), usableSize); wptr += usableSize; } - return Reference(p); + return Reference(superpage); } Checksum& getChecksum() { return *(Checksum*)(buffer + size()); } From b4e766bd13d8b44eb4be53b3ed96587db8af4e99 Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Sun, 16 May 2021 02:00:43 -0700 Subject: [PATCH 403/461] Bug fix, returned value wasn't pointing into the correct arena. 
--- fdbserver/VersionedBTree.actor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 66e6bed9b7..abfacef7a7 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -6316,7 +6316,7 @@ public: wait(cur.seekGTE(key, 0)); if (cur.isValid() && cur.get().key == key) { - Value v = cur.get().value.get(); + ValueRef v = cur.get().value.get(); int len = std::min(v.size(), maxLength); // Return a Value prefix whose arena is the source page's arena return Value(v.substr(0, len), cur.back().page->getArena()); From bd0c4a4892398cb7629c7c8b19467321ae6c56ec Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Sun, 16 May 2021 03:03:55 -0700 Subject: [PATCH 404/461] Avoid callers of getValue() and getValuePrefix() from being able to add arena dependencies to the source page arena. --- fdbserver/VersionedBTree.actor.cpp | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index abfacef7a7..1e7bb4e712 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -6292,7 +6292,10 @@ public: wait(cur.seekGTE(key, 0)); if (cur.isValid() && cur.get().key == key) { // Return a Value whose arena is the source page's arena - return Value(cur.get().value.get(), cur.back().page->getArena()); + Value v; + v.arena().dependsOn(cur.back().page->getArena()); + v.contents() = cur.get().value.get(); + return v; } return Optional(); @@ -6316,10 +6319,14 @@ public: wait(cur.seekGTE(key, 0)); if (cur.isValid() && cur.get().key == key) { - ValueRef v = cur.get().value.get(); - int len = std::min(v.size(), maxLength); - // Return a Value prefix whose arena is the source page's arena - return Value(v.substr(0, len), cur.back().page->getArena()); + // Return a Value whose arena is the source page's arena + Value v; + 
v.arena().dependsOn(cur.back().page->getArena()); + v.contents() = cur.get().value.get(); + if (v.size() > maxLength) { + v.contents() = v.substr(0, maxLength); + } + return v; } return Optional(); From a31e4f622f928dd6959ef40171f67d0abc2296ac Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Sun, 16 May 2021 03:58:05 -0700 Subject: [PATCH 405/461] Changed ArenaBlockRef to use 32 bit aligned4kBuffer size. --- flow/Arena.cpp | 10 +++++----- flow/Arena.h | 8 ++++---- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/flow/Arena.cpp b/flow/Arena.cpp index 016112cc5b..096ded32fd 100644 --- a/flow/Arena.cpp +++ b/flow/Arena.cpp @@ -102,7 +102,7 @@ void Arena::dependsOn(const Arena& p) { } } -void* Arena::allocate4kAlignedBuffer(size_t size) { +void* Arena::allocate4kAlignedBuffer(uint32_t size) { return ArenaBlock::dependOn4kAlignedBuffer(impl, size); } @@ -234,12 +234,12 @@ void ArenaBlock::makeReference(ArenaBlock* next) { bigUsed += sizeof(ArenaBlockRef); } -void* ArenaBlock::make4kAlignedBuffer(size_t size) { +void* ArenaBlock::make4kAlignedBuffer(uint32_t size) { ArenaBlockRef* r = (ArenaBlockRef*)((char*)getData() + bigUsed); makeDefined(r, sizeof(ArenaBlockRef)); r->aligned4kBufferSize = size; r->aligned4kBuffer = allocateFast4kAligned(size); - //printf("Arena::aligned4kBuffer alloc size=%u ptr=%p\n", size, r->aligned4kBuffer); + // printf("Arena::aligned4kBuffer alloc size=%u ptr=%p\n", size, r->aligned4kBuffer); r->nextBlockOffset = nextBlockOffset; makeNoAccess(r, sizeof(ArenaBlockRef)); nextBlockOffset = bigUsed; @@ -255,7 +255,7 @@ void ArenaBlock::dependOn(Reference& self, ArenaBlock* other) { self->makeReference(other); } -void* ArenaBlock::dependOn4kAlignedBuffer(Reference& self, size_t size) { +void* ArenaBlock::dependOn4kAlignedBuffer(Reference& self, uint32_t size) { if (!self || self->isTiny() || self->unused() < sizeof(ArenaBlockRef)) { return create(SMALL, self)->make4kAlignedBuffer(size); } else { @@ -398,7 +398,7 @@ void 
ArenaBlock::destroy() { // If aligned4kBuffer is valid, free it if (br->aligned4kBufferSize != 0) { - //printf("Arena::aligned4kBuffer free %p\n", br->aligned4kBuffer); + // printf("Arena::aligned4kBuffer free %p\n", br->aligned4kBuffer); freeFast4kAligned(br->aligned4kBufferSize, br->aligned4kBuffer); } else { allowAccess(br->next); diff --git a/flow/Arena.h b/flow/Arena.h index a34dcf67c6..c08072e35c 100644 --- a/flow/Arena.h +++ b/flow/Arena.h @@ -102,7 +102,7 @@ public: Arena& operator=(Arena&&) noexcept; void dependsOn(const Arena& p); - void* allocate4kAlignedBuffer(size_t size); + void* allocate4kAlignedBuffer(uint32_t size); size_t getSize() const; bool hasFree(size_t size, const void* address); @@ -132,7 +132,7 @@ struct scalar_traits : std::true_type { struct ArenaBlockRef { // Only one of (next, aligned4kBuffer) are valid at any one time, as they occupy the same space. // If aligned4kBufferSize is not 0, aligned4kBuffer is valid, otherwise next is valid. - size_t aligned4kBufferSize; + uint32_t aligned4kBufferSize; union { ArenaBlock* next; void* aligned4kBuffer; @@ -167,9 +167,9 @@ struct ArenaBlock : NonCopyable, ThreadSafeReferenceCounted { void getUniqueBlocks(std::set& a); int addUsed(int bytes); void makeReference(ArenaBlock* next); - void* make4kAlignedBuffer(size_t size); + void* make4kAlignedBuffer(uint32_t size); static void dependOn(Reference& self, ArenaBlock* other); - static void* dependOn4kAlignedBuffer(Reference& self, size_t size); + static void* dependOn4kAlignedBuffer(Reference& self, uint32_t size); static void* allocate(Reference& self, int bytes); // Return an appropriately-sized ArenaBlock to store the given data static ArenaBlock* create(int dataSize, Reference& next); From f88596bfd0d207c3f953b9eb09effcf3251865de Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Sun, 16 May 2021 05:13:55 -0700 Subject: [PATCH 406/461] Applied clang-format after conflict resolution. 
--- fdbserver/VersionedBTree.actor.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index ba3dd2c86b..7e29d34147 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -4279,8 +4279,7 @@ private: const RedwoodRecordRef* upperBound, bool forLazyClear = false, bool cacheable = true, - bool* fromCache = nullptr) - { + bool* fromCache = nullptr) { if (!forLazyClear) { debug_printf("readPage() op=read %s @%" PRId64 " lower=%s upper=%s\n", toString(id).c_str(), From 8b1f9f733749ffc347543152fe101d47c2427701 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Tue, 11 May 2021 00:05:08 +0000 Subject: [PATCH 407/461] Add command line support --- .gitignore | 2 +- fdbcli/fdbcli.actor.cpp | 11 ++++++++--- fdbclient/DatabaseConfiguration.cpp | 11 +++++++---- fdbclient/DatabaseConfiguration.h | 3 +++ fdbclient/ManagementAPI.actor.cpp | 8 ++++++++ 5 files changed, 27 insertions(+), 8 deletions(-) diff --git a/.gitignore b/.gitignore index 2b74cc1f7c..f555965fab 100644 --- a/.gitignore +++ b/.gitignore @@ -7,7 +7,7 @@ bindings/java/foundationdb-client*.jar bindings/java/foundationdb-tests*.jar bindings/java/fdb-java-*-sources.jar packaging/msi/FDBInstaller.msi - +builds/ # Generated source, build, and packaging files *.g.cpp *.g.h diff --git a/fdbcli/fdbcli.actor.cpp b/fdbcli/fdbcli.actor.cpp index d10da845ec..7f1bb3b735 100644 --- a/fdbcli/fdbcli.actor.cpp +++ b/fdbcli/fdbcli.actor.cpp @@ -496,7 +496,8 @@ void initHelp() { helpMap["configure"] = CommandHelp( "configure [new] " "|" - "commit_proxies=|grv_proxies=|logs=|resolvers=>*", + "commit_proxies=|grv_proxies=|logs=|resolvers=>*|" + "perpetual_storage_wiggle=", "change the database configuration", "The `new' option, if present, initializes a new database with the given configuration rather than changing " "the configuration of an existing one. 
When used, both a redundancy mode and a storage engine must be " @@ -517,8 +518,11 @@ void initHelp() { "1, or set to -1 which restores the number of GRV proxies to the default value.\n\nlogs=: Sets the " "desired number of log servers in the cluster. Must be at least 1, or set to -1 which restores the number of " "logs to the default value.\n\nresolvers=: Sets the desired number of resolvers in the cluster. " - "Must be at least 1, or set to -1 which restores the number of resolvers to the default value.\n\nSee the " - "FoundationDB Administration Guide for more information."); + "Must be at least 1, or set to -1 which restores the number of resolvers to the default value.\n\n" + "perpetual_storage_wiggle=: Set the value speed (a.k.a., the number of processes that the Data " + "Distributor should wiggle at a time). Currently, only 0 and 1 are supported. The value 0 means to disable the " + "perpetual storage wiggle.\n\n" + "See the FoundationDB Administration Guide for more information."); helpMap["fileconfigure"] = CommandHelp( "fileconfigure [new] ", "change the database configuration from a file", @@ -2766,6 +2770,7 @@ void configureGenerator(const char* text, const char* line, std::vectorinfo() != "dcid^2 x zoneid^2 x 1") && + // We cannot specify regions with three_datacenter replication + (perpetualStorageWiggleSpeed == 0 || perpetualStorageWiggleSpeed == 1))) { return false; } std::set dcIds; @@ -352,7 +353,7 @@ StatusObject DatabaseConfiguration::toJSON(bool noPolicies) const { } result["backup_worker_enabled"] = (int32_t)backupWorkerEnabled; - + result["perpetual_storage_wiggle"] = perpetualStorageWiggleSpeed; return result; } @@ -499,6 +500,8 @@ bool DatabaseConfiguration::setInternal(KeyRef key, ValueRef value) { parse(&repopulateRegionAntiQuorum, value); } else if (ck == LiteralStringRef("regions")) { parse(®ions, value); + } else if (ck == LiteralStringRef("perpetual_storage_wiggle")) { + parse(&perpetualStorageWiggleSpeed, value); } else { return 
false; } diff --git a/fdbclient/DatabaseConfiguration.h b/fdbclient/DatabaseConfiguration.h index bc64a6c9c5..ef539f40b0 100644 --- a/fdbclient/DatabaseConfiguration.h +++ b/fdbclient/DatabaseConfiguration.h @@ -239,6 +239,9 @@ struct DatabaseConfiguration { int32_t repopulateRegionAntiQuorum; std::vector regions; + // Perpetual Storage Setting + int32_t perpetualStorageWiggleSpeed; + // Excluded servers (no state should be here) bool isExcludedServer(NetworkAddressList) const; std::set getExcludedServers() const; diff --git a/fdbclient/ManagementAPI.actor.cpp b/fdbclient/ManagementAPI.actor.cpp index 90d670e801..f53cf65828 100644 --- a/fdbclient/ManagementAPI.actor.cpp +++ b/fdbclient/ManagementAPI.actor.cpp @@ -134,6 +134,14 @@ std::map configForToken(std::string const& mode) { BinaryWriter::toValue(regionObj, IncludeVersion(ProtocolVersion::withRegionConfiguration())).toString(); } + if (key == "perpetual_storage_wiggle" && isInteger(value)) { + int ppWiggle = atoi(value.c_str()); + if (ppWiggle >= 2 || ppWiggle < 0) { + printf("Error: Only 0 and 1 are valid values of perpetual_storage_wiggle at present.\n"); + return out; + } + out[p + key] = value; + } return out; } From 6065d247f86e12988cdab05a1adc910a8abc7ea4 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Mon, 17 May 2021 20:22:27 +0000 Subject: [PATCH 408/461] fix perpetualStorageWiggleKey --- fdbclient/SystemData.cpp | 3 +++ fdbclient/SystemData.h | 2 ++ 2 files changed, 5 insertions(+) diff --git a/fdbclient/SystemData.cpp b/fdbclient/SystemData.cpp index 314df8930b..fd8d2faa9f 100644 --- a/fdbclient/SystemData.cpp +++ b/fdbclient/SystemData.cpp @@ -594,6 +594,9 @@ ProcessClass decodeProcessClassValue(ValueRef const& value) { const KeyRangeRef configKeys(LiteralStringRef("\xff/conf/"), LiteralStringRef("\xff/conf0")); const KeyRef configKeysPrefix = configKeys.begin; +const KeyRef perpetualStorageWiggleKey(LiteralStringRef("\xff/conf/perpetual_storage_wiggle")); +const KeyRef 
wigglingStorageServerKey(LiteralStringRef("\xff/storageWiggleUID")); + const KeyRef triggerDDTeamInfoPrintKey(LiteralStringRef("\xff/triggerDDTeamInfoPrint")); const KeyRangeRef excludedServersKeys(LiteralStringRef("\xff/conf/excluded/"), LiteralStringRef("\xff/conf/excluded0")); diff --git a/fdbclient/SystemData.h b/fdbclient/SystemData.h index e7a54e632c..79efb688c8 100644 --- a/fdbclient/SystemData.h +++ b/fdbclient/SystemData.h @@ -196,6 +196,8 @@ UID decodeProcessClassKeyOld(KeyRef const& key); extern const KeyRangeRef configKeys; extern const KeyRef configKeysPrefix; +extern const KeyRef perpetualStorageWiggleKey; +extern const KeyRef wigglingStorageServerKey; // Change the value of this key to anything and that will trigger detailed data distribution team info log. extern const KeyRef triggerDDTeamInfoPrintKey; From e40538729e29882c93396e15c5f3fc796cff2ddf Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Mon, 17 May 2021 14:46:06 -0700 Subject: [PATCH 409/461] Update fdbserver/IPager.h Co-authored-by: Andrew Noyes --- fdbserver/IPager.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbserver/IPager.h b/fdbserver/IPager.h index 55146b6533..31ff36ef88 100644 --- a/fdbserver/IPager.h +++ b/fdbserver/IPager.h @@ -56,7 +56,7 @@ public: if (userData != nullptr && userDataDestructor != nullptr) { userDataDestructor(userData); } - if(buffer != 0) { + if(buffer != nullptr) { VALGRIND_MAKE_MEM_UNDEFINED(buffer, bufferSize); } } From 60504e12ac7e4e4e8f9624402496a98015e9b4ec Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Mon, 17 May 2021 18:02:09 -0700 Subject: [PATCH 410/461] Address review comments. 
--- fdbserver/VersionedBTree.actor.cpp | 33 +++++------------------------- flow/Arena.h | 8 +++++--- flow/FastAlloc.h | 2 ++ 3 files changed, 12 insertions(+), 31 deletions(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 7e29d34147..b5c1e85dfd 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -6372,33 +6372,6 @@ public: return result; } - ACTOR static Future> readValue_impl(KeyValueStoreRedwoodUnversioned* self, - Key key, - Optional debugID) { - state VersionedBTree::BTreeCursor cur; - wait(self->m_tree->initBTreeCursor(&cur, self->m_tree->getLastCommittedVersion())); - - state Reference readLock = self->m_concurrentReads; - wait(readLock->take()); - state FlowLock::Releaser releaser(*readLock); - ++g_redwoodMetrics.opGet; - - wait(cur.seekGTE(key, 0)); - if (cur.isValid() && cur.get().key == key) { - // Return a Value whose arena is the source page's arena - Value v; - v.arena().dependsOn(cur.back().page->getArena()); - v.contents() = cur.get().value.get(); - return v; - } - - return Optional(); - } - - Future> readValue(KeyRef key, Optional debugID = Optional()) override { - return catchError(readValue_impl(this, key, debugID)); - } - ACTOR static Future> readValuePrefix_impl(KeyValueStoreRedwoodUnversioned* self, Key key, int maxLength, @@ -6413,7 +6386,7 @@ public: wait(cur.seekGTE(key, 0)); if (cur.isValid() && cur.get().key == key) { - // Return a Value whose arena is the source page's arena + // Return a Value whose arena depends on the source page arena Value v; v.arena().dependsOn(cur.back().page->getArena()); v.contents() = cur.get().value.get(); @@ -6432,6 +6405,10 @@ public: return catchError(readValuePrefix_impl(this, key, maxLength, debugID)); } + Future> readValue(KeyRef key, Optional debugID = Optional()) override { + return catchError(readValuePrefix_impl(this, key, std::numeric_limits::max(), debugID)); + } + ~KeyValueStoreRedwoodUnversioned() override{}; 
private: diff --git a/flow/Arena.h b/flow/Arena.h index c08072e35c..b940c6bcb0 100644 --- a/flow/Arena.h +++ b/flow/Arena.h @@ -130,13 +130,15 @@ struct scalar_traits : std::true_type { }; struct ArenaBlockRef { - // Only one of (next, aligned4kBuffer) are valid at any one time, as they occupy the same space. - // If aligned4kBufferSize is not 0, aligned4kBuffer is valid, otherwise next is valid. - uint32_t aligned4kBufferSize; union { ArenaBlock* next; void* aligned4kBuffer; }; + + // Only one of (next, aligned4kBuffer) is valid at any one time, as they occupy the same space. + // If aligned4kBufferSize is not 0, aligned4kBuffer is valid, otherwise next is valid. + uint32_t aligned4kBufferSize; + uint32_t nextBlockOffset; }; diff --git a/flow/FastAlloc.h b/flow/FastAlloc.h index 3f5f2ab58b..55e5731fd5 100644 --- a/flow/FastAlloc.h +++ b/flow/FastAlloc.h @@ -267,6 +267,7 @@ inline void freeFast(int size, void* ptr) { } [[nodiscard]] inline void* allocateFast4kAligned(int size) { + // Use FastAllocator for sizes it supports to avoid internal fragmentation in some implementations of aligned_alloc if (size <= 4096) return FastAllocator<4096>::allocate(); if (size <= 8192) @@ -277,6 +278,7 @@ inline void freeFast(int size, void* ptr) { } inline void freeFast4kAligned(int size, void* ptr) { + // Sizes supported by FastAllocator must be release via FastAllocator if (size <= 4096) return FastAllocator<4096>::release(ptr); if (size <= 8192) From f30793fd85157829541eecbe0c3cc49dff95efe1 Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Mon, 17 May 2021 19:27:06 -0700 Subject: [PATCH 411/461] Implement getValuePrefix() using getValue() rather than the other way around to avoid the common getValue()'s actor state increasing from 128 to 256 bytes since it is a very hot code path. 
--- fdbserver/IPager.h | 2 +- fdbserver/VersionedBTree.actor.cpp | 25 +++++++++++++------------ 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/fdbserver/IPager.h b/fdbserver/IPager.h index 31ff36ef88..bc2a0f68f1 100644 --- a/fdbserver/IPager.h +++ b/fdbserver/IPager.h @@ -56,7 +56,7 @@ public: if (userData != nullptr && userDataDestructor != nullptr) { userDataDestructor(userData); } - if(buffer != nullptr) { + if (buffer != nullptr) { VALGRIND_MAKE_MEM_UNDEFINED(buffer, bufferSize); } } diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index b5c1e85dfd..8a919fd190 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -6372,10 +6372,9 @@ public: return result; } - ACTOR static Future> readValuePrefix_impl(KeyValueStoreRedwoodUnversioned* self, - Key key, - int maxLength, - Optional debugID) { + ACTOR static Future> readValue_impl(KeyValueStoreRedwoodUnversioned* self, + Key key, + Optional debugID) { state VersionedBTree::BTreeCursor cur; wait(self->m_tree->initBTreeCursor(&cur, self->m_tree->getLastCommittedVersion())); @@ -6390,23 +6389,25 @@ public: Value v; v.arena().dependsOn(cur.back().page->getArena()); v.contents() = cur.get().value.get(); - if (v.size() > maxLength) { - v.contents() = v.substr(0, maxLength); - } return v; } return Optional(); } + Future> readValue(KeyRef key, Optional debugID = Optional()) override { + return catchError(readValue_impl(this, key, debugID)); + } + Future> readValuePrefix(KeyRef key, int maxLength, Optional debugID = Optional()) override { - return catchError(readValuePrefix_impl(this, key, maxLength, debugID)); - } - - Future> readValue(KeyRef key, Optional debugID = Optional()) override { - return catchError(readValuePrefix_impl(this, key, std::numeric_limits::max(), debugID)); + return catchError(map(readValue_impl(this, key, debugID), [maxLength](Optional v) { + if (v.present() && v.get().size() > maxLength) { + v.get().contents() = 
v.get().substr(0, maxLength); + } + return v; + })); } ~KeyValueStoreRedwoodUnversioned() override{}; From 319e77eef12897420e59715c57e69162405620d8 Mon Sep 17 00:00:00 2001 From: RenxuanW Date: Mon, 17 May 2021 19:15:33 -0700 Subject: [PATCH 412/461] Add severity in logWorkerUnavailable(). Also, only log when fitness is GoodFit or BestFit. --- fdbserver/ClusterController.actor.cpp | 89 +++++++++++++++++---------- 1 file changed, 58 insertions(+), 31 deletions(-) diff --git a/fdbserver/ClusterController.actor.cpp b/fdbserver/ClusterController.actor.cpp index 107c865221..53304bc6f6 100644 --- a/fdbserver/ClusterController.actor.cpp +++ b/fdbserver/ClusterController.actor.cpp @@ -459,7 +459,8 @@ public: } // Log the reason why the worker is considered as unavailable. - void logWorkerUnavailable(const UID& id, + void logWorkerUnavailable(const Severity severity, + const UID& id, const std::string& method, const std::string& reason, const WorkerDetails& details, @@ -473,17 +474,21 @@ public: } dcList += printable(dc); } - // Note that the recruitment happens only during initial database creation and recovery. So these trace - // events should be sparse. - TraceEvent("GetTLogTeamWorkerUnavailable", id) - .detail("TLogRecruitMethod", method) - .detail("Reason", reason) - .detail("WorkerID", details.interf.id()) - .detail("WorkerDC", details.interf.locality.dcId()) - .detail("Address", details.interf.addresses().toString()) - .detail("Fitness", fitness) - .detail("RecruitmentDcIds", dcList); - }; + // Logging every possible options is a lot for every recruitment; logging all of the options with GoodFit or + // BestFit may work because there should only be like 30 tlog class processes. Plus, the recruitment happens + // only during initial database creation and recovery. So these trace events should be sparse. 
+ if (fitness == ProcessClass::GoodFit || fitness == ProcessClass::BestFit || + fitness == ProcessClass::NeverAssign) { + TraceEvent(severity, "GetTLogTeamWorkerUnavailable", id) + .detail("TLogRecruitMethod", method) + .detail("Reason", reason) + .detail("WorkerID", details.interf.id()) + .detail("WorkerDC", details.interf.locality.dcId()) + .detail("Address", details.interf.addresses().toString()) + .detail("Fitness", fitness) + .detail("RecruitmentDcIds", dcList); + } + } // A TLog recruitment method specialized for three_data_hall and three_datacenter configurations // It attempts to evenly recruit processes from across data_halls or datacenters @@ -506,29 +511,36 @@ public: if (std::find(exclusionWorkerIds.begin(), exclusionWorkerIds.end(), worker_details.interf.id()) != exclusionWorkerIds.end()) { - logWorkerUnavailable(id, "complex", "Worker is excluded", worker_details, fitness, dcIds); + logWorkerUnavailable(SevInfo, id, "complex", "Worker is excluded", worker_details, fitness, dcIds); continue; } if (!workerAvailable(worker_info, checkStable)) { - logWorkerUnavailable(id, "complex", "Worker is not available", worker_details, fitness, dcIds); + logWorkerUnavailable(SevInfo, id, "complex", "Worker is not available", worker_details, fitness, dcIds); continue; } if (conf.isExcludedServer(worker_details.interf.addresses())) { - logWorkerUnavailable( - id, "complex", "Worker server is excluded from the cluster", worker_details, fitness, dcIds); + logWorkerUnavailable(SevInfo, + id, + "complex", + "Worker server is excluded from the cluster", + worker_details, + fitness, + dcIds); continue; } if (fitness == ProcessClass::NeverAssign) { - logWorkerUnavailable(id, "complex", "Worker's fitness is NeverAssign", worker_details, fitness, dcIds); + logWorkerUnavailable( + SevDebug, id, "complex", "Worker's fitness is NeverAssign", worker_details, fitness, dcIds); continue; } if (!dcIds.empty() && dcIds.count(worker_details.interf.locality.dcId()) == 0) { - 
logWorkerUnavailable(id, "complex", "Worker is not in the target DC", worker_details, fitness, dcIds); + logWorkerUnavailable( + SevDebug, id, "complex", "Worker is not in the target DC", worker_details, fitness, dcIds); continue; } if (!allowDegraded && worker_details.degraded) { logWorkerUnavailable( - id, "complex", "Worker is degraded and not allowed", worker_details, fitness, dcIds); + SevInfo, id, "complex", "Worker is degraded and not allowed", worker_details, fitness, dcIds); continue; } @@ -731,26 +743,34 @@ public: for (const auto& [worker_process_id, worker_info] : id_worker) { const auto& worker_details = worker_info.details; auto fitness = worker_details.processClass.machineClassFitness(ProcessClass::TLog); + if (std::find(exclusionWorkerIds.begin(), exclusionWorkerIds.end(), worker_details.interf.id()) != exclusionWorkerIds.end()) { - logWorkerUnavailable(id, "simple", "Worker is excluded", worker_details, fitness, dcIds); + logWorkerUnavailable(SevInfo, id, "simple", "Worker is excluded", worker_details, fitness, dcIds); continue; } if (!workerAvailable(worker_info, checkStable)) { - logWorkerUnavailable(id, "simple", "Worker is not available", worker_details, fitness, dcIds); + logWorkerUnavailable(SevInfo, id, "simple", "Worker is not available", worker_details, fitness, dcIds); continue; } if (conf.isExcludedServer(worker_details.interf.addresses())) { - logWorkerUnavailable( - id, "simple", "Worker server is excluded from the cluster", worker_details, fitness, dcIds); + logWorkerUnavailable(SevInfo, + id, + "simple", + "Worker server is excluded from the cluster", + worker_details, + fitness, + dcIds); continue; } if (fitness == ProcessClass::NeverAssign) { - logWorkerUnavailable(id, "simple", "Worker's fitness is NeverAssign", worker_details, fitness, dcIds); + logWorkerUnavailable( + SevDebug, id, "complex", "Worker's fitness is NeverAssign", worker_details, fitness, dcIds); continue; } if (!dcIds.empty() && 
dcIds.count(worker_details.interf.locality.dcId()) == 0) { - logWorkerUnavailable(id, "simple", "Worker is not in the target DC", worker_details, fitness, dcIds); + logWorkerUnavailable( + SevDebug, id, "simple", "Worker is not in the target DC", worker_details, fitness, dcIds); continue; } @@ -855,28 +875,35 @@ public: for (const auto& [worker_process_id, worker_info] : id_worker) { const auto& worker_details = worker_info.details; auto fitness = worker_details.processClass.machineClassFitness(ProcessClass::TLog); + if (std::find(exclusionWorkerIds.begin(), exclusionWorkerIds.end(), worker_details.interf.id()) != exclusionWorkerIds.end()) { - logWorkerUnavailable(id, "deprecated", "Worker is excluded", worker_details, fitness, dcIds); + logWorkerUnavailable(SevInfo, id, "deprecated", "Worker is excluded", worker_details, fitness, dcIds); continue; } if (!workerAvailable(worker_info, checkStable)) { - logWorkerUnavailable(id, "deprecated", "Worker is not available", worker_details, fitness, dcIds); + logWorkerUnavailable( + SevInfo, id, "deprecated", "Worker is not available", worker_details, fitness, dcIds); continue; } if (conf.isExcludedServer(worker_details.interf.addresses())) { - logWorkerUnavailable( - id, "deprecated", "Worker server is excluded from the cluster", worker_details, fitness, dcIds); + logWorkerUnavailable(SevInfo, + id, + "deprecated", + "Worker server is excluded from the cluster", + worker_details, + fitness, + dcIds); continue; } if (fitness == ProcessClass::NeverAssign) { logWorkerUnavailable( - id, "deprecated", "Worker's fitness is NeverAssign", worker_details, fitness, dcIds); + SevDebug, id, "complex", "Worker's fitness is NeverAssign", worker_details, fitness, dcIds); continue; } if (!dcIds.empty() && dcIds.count(worker_details.interf.locality.dcId()) == 0) { logWorkerUnavailable( - id, "deprecated", "Worker is not in the target DC", worker_details, fitness, dcIds); + SevDebug, id, "deprecated", "Worker is not in the target DC", 
worker_details, fitness, dcIds); continue; } From 3066e856c962693926005c0828661aa7cf0eeef4 Mon Sep 17 00:00:00 2001 From: Sreenath Bodagala Date: Wed, 19 May 2021 16:08:32 +0000 Subject: [PATCH 413/461] Expose "bounce impact" and Storage Server "version catch-up rate" metrics Changes: storageserver.actor.cpp: Use counters to capture (a) how fast a storage server is catching up in versions and (b) the version fetch frequency. Status.actor.cpp: Report the captured counter metrics as part of storage metrics. --- fdbserver/Status.actor.cpp | 2 ++ fdbserver/storageserver.actor.cpp | 24 +++++++++++------------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/fdbserver/Status.actor.cpp b/fdbserver/Status.actor.cpp index 12235f9d31..2277005867 100644 --- a/fdbserver/Status.actor.cpp +++ b/fdbserver/Status.actor.cpp @@ -491,6 +491,8 @@ struct RolesInfo { obj["mutation_bytes"] = StatusCounter(storageMetrics.getValue("MutationBytes")).getStatus(); obj["mutations"] = StatusCounter(storageMetrics.getValue("Mutations")).getStatus(); obj.setKeyRawNumber("local_rate", storageMetrics.getValue("LocalRate")); + obj["fetched_versions"] = StatusCounter(storageMetrics.getValue("FetchedVersions")).getStatus(); + obj["fetch_frequency"] = StatusCounter(storageMetrics.getValue("FetchFrequency")).getStatus(); Version version = storageMetrics.getInt64("Version"); Version durableVersion = storageMetrics.getInt64("DurableVersion"); diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp index 8cd4680f6d..4516e2a176 100644 --- a/fdbserver/storageserver.actor.cpp +++ b/fdbserver/storageserver.actor.cpp @@ -545,10 +545,7 @@ public: int64_t versionLag; // An estimate for how many versions it takes for the data to move from the logs to this storage // server - // Metrics about the latest batch of versions fetched by this StorageServer - int64_t fetchedVersions; // how many versions were fetched - double duration; // how long (in seconds) it took to fetch 
the versions - Optional sourceTLogID; // the tLog from which the versions were fetched + Optional sourceTLogID; // the tLog from which the latest batch of versions were fetched ProtocolVersion logProtocol; @@ -683,6 +680,8 @@ public: Counter loops; Counter fetchWaitingMS, fetchWaitingCount, fetchExecutingMS, fetchExecutingCount; Counter readsRejected; + Counter fetchedVersions; + Counter fetchFrequency; LatencySample readLatencySample; LatencyBands readLatencyBands; @@ -700,10 +699,11 @@ public: updateBatches("UpdateBatches", cc), updateVersions("UpdateVersions", cc), loops("Loops", cc), fetchWaitingMS("FetchWaitingMS", cc), fetchWaitingCount("FetchWaitingCount", cc), fetchExecutingMS("FetchExecutingMS", cc), fetchExecutingCount("FetchExecutingCount", cc), - readsRejected("ReadsRejected", cc), readLatencySample("ReadLatencyMetrics", - self->thisServerID, - SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL, - SERVER_KNOBS->LATENCY_SAMPLE_SIZE), + readsRejected("ReadsRejected", cc), fetchedVersions("FetchedVersions", cc), + fetchFrequency("FetchFrequency", cc), readLatencySample("ReadLatencyMetrics", + self->thisServerID, + SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL, + SERVER_KNOBS->LATENCY_SAMPLE_SIZE), readLatencyBands("ReadLatencyBands", self->thisServerID, SERVER_KNOBS->STORAGE_LOGGING_DELAY) { specialCounter(cc, "LastTLogVersion", [self]() { return self->lastTLogVersion; }); specialCounter(cc, "Version", [self]() { return self->version.get(); }); @@ -711,8 +711,6 @@ public: specialCounter(cc, "DurableVersion", [self]() { return self->durableVersion.get(); }); specialCounter(cc, "DesiredOldestVersion", [self]() { return self->desiredOldestVersion.get(); }); specialCounter(cc, "VersionLag", [self]() { return self->versionLag; }); - specialCounter(cc, "FetchedVersions", [self]() { return self->fetchedVersions; }); - specialCounter(cc, "Duration", [self]() { return self->duration; }); specialCounter(cc, "LocalRate", [self] { return self->currentRate() * 100; }); 
specialCounter(cc, "BytesReadSampleCount", [self]() { return self->metrics.bytesReadSample.queue.size(); }); @@ -739,7 +737,7 @@ public: : fetchKeysHistograms(), instanceID(deterministicRandom()->randomUniqueID().first()), storage(this, storage), db(db), actors(false), lastTLogVersion(0), lastVersionWithData(0), restoredVersion(0), rebootAfterDurableVersion(std::numeric_limits::max()), durableInProgress(Void()), versionLag(0), - fetchedVersions(0), duration(0.0), primaryLocality(tagLocalityInvalid), updateEagerReads(0), shardChangeCounter(0), + primaryLocality(tagLocalityInvalid), updateEagerReads(0), shardChangeCounter(0), fetchKeysParallelismLock(SERVER_KNOBS->FETCH_KEYS_PARALLELISM_BYTES), shuttingDown(false), debug_inApplyUpdate(false), debug_lastValidateTime(0), watchBytes(0), numWatches(0), logProtocol(0), counters(this), tag(invalidTag), maxQueryQueue(0), thisServerID(ssi.id()), @@ -3530,8 +3528,8 @@ ACTOR Future update(StorageServer* data, bool* pReceivedUpdate) { if (data->otherError.getFuture().isReady()) data->otherError.getFuture().get(); - data->fetchedVersions = ver - data->version.get(); - data->duration = now() - data->lastUpdate; + data->counters.fetchedVersions += (ver - data->version.get()); + ++data->counters.fetchFrequency; Optional curSourceTLogID = cursor->getCurrentPeekLocation(); if (curSourceTLogID != data->sourceTLogID) { From 622f43474aa3a44b711185910bb9304856e1a526 Mon Sep 17 00:00:00 2001 From: Sreenath Bodagala Date: Wed, 19 May 2021 19:54:49 +0000 Subject: [PATCH 414/461] Expose "bounce impact" and Storage Server "version catch-up rate" metrics Changes: Schemas.cpp: Extend the JSON schema to report the new metrics that have been added. mr-status-json-schemas.rst.inc: Update the schema to reflect the changes made to the JSON schema. release-notes-700.rst: Add a note about the new metrics in "Status" section. 
--- .../sphinx/source/mr-status-json-schemas.rst.inc | 10 ++++++++++ .../sphinx/source/release-notes/release-notes-700.rst | 2 ++ fdbclient/Schemas.cpp | 10 ++++++++++ 3 files changed, 22 insertions(+) diff --git a/documentation/sphinx/source/mr-status-json-schemas.rst.inc b/documentation/sphinx/source/mr-status-json-schemas.rst.inc index 202496620d..deb8afcdb7 100644 --- a/documentation/sphinx/source/mr-status-json-schemas.rst.inc +++ b/documentation/sphinx/source/mr-status-json-schemas.rst.inc @@ -121,6 +121,16 @@ "counter":0, "roughness":0.0 }, + "fetched_versions":{ + "hz":0.0, + "counter":0, + "roughness":0.0 + }, + "fetch_frequency":{ + "hz":0.0, + "counter":0, + "roughness":0.0 + }, "grv_latency_statistics":{ // GRV Latency metrics are grouped according to priority (currently batch or default). "default":{ "count":0, diff --git a/documentation/sphinx/source/release-notes/release-notes-700.rst b/documentation/sphinx/source/release-notes/release-notes-700.rst index 8997f8c9fd..85ca56979c 100644 --- a/documentation/sphinx/source/release-notes/release-notes-700.rst +++ b/documentation/sphinx/source/release-notes/release-notes-700.rst @@ -32,6 +32,8 @@ Status ------ * Added ``commit_batching_window_size`` to the proxy roles section of status to record statistics about commit batching window size on each proxy. `(PR #4735) `_ * Added ``cluster.bounce_impact`` section to status to report if there will be any extra effects when bouncing the cluster, and if so, the reason for those effects. `(PR #4770) `_ +* Added ``fetched_versions`` to the storage metrics section of status to report how fast a storage server is catching up in versions. `(PR #4770) `_ +* Added ``fetch_frequency`` to the storage metrics section of status to report the version fetching frequency of a storage server. 
`(PR #4770) `_ Bindings -------- diff --git a/fdbclient/Schemas.cpp b/fdbclient/Schemas.cpp index 0ba2feaaaa..c6dc059573 100644 --- a/fdbclient/Schemas.cpp +++ b/fdbclient/Schemas.cpp @@ -144,6 +144,16 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema( "counter":0, "roughness":0.0 }, + "fetched_versions":{ + "hz":0.0, + "counter":0, + "roughness":0.0 + }, + "fetch_frequency":{ + "hz":0.0, + "counter":0, + "roughness":0.0 + }, "grv_latency_statistics":{ "default":{ "count":0, From d067ca507bea78a7c4c15c5c3a2730536ec71a67 Mon Sep 17 00:00:00 2001 From: Josh Slocum Date: Wed, 19 May 2021 19:57:51 +0000 Subject: [PATCH 415/461] Surfacing non-fatal parse errors in Test Harness output --- contrib/TestHarness/Program.cs.cmake | 11 ++++++++-- contrib/TraceLogHelper/JsonParser.cs | 5 +++-- contrib/TraceLogHelper/XmlParser.cs | 31 +++++++++++++++++----------- 3 files changed, 31 insertions(+), 16 deletions(-) diff --git a/contrib/TestHarness/Program.cs.cmake b/contrib/TestHarness/Program.cs.cmake index 8d666b2725..075a2758d6 100644 --- a/contrib/TestHarness/Program.cs.cmake +++ b/contrib/TestHarness/Program.cs.cmake @@ -717,7 +717,7 @@ namespace SummarizeTest delegate IEnumerable parseDelegate(System.IO.Stream stream, string file, bool keepOriginalElement = false, double startTime = -1, double endTime = Double.MaxValue, - double samplingFactor = 1.0); + double samplingFactor = 1.0, Action nonFatalErrorMessage = null); static int Summarize(string[] traceFiles, string summaryFileName, string errorFileName, bool? killed, List outputErrors, int? exitCode, long? 
peakMemory, @@ -750,12 +750,14 @@ namespace SummarizeTest { try { + // Use Action to set this because IEnumerables with yield can't have an out variable + string nonFatalParseError = null; parseDelegate parse; if (traceFileName.EndsWith(".json")) parse = Magnesium.JsonParser.Parse; else parse = Magnesium.XmlParser.Parse; - foreach (var ev in parse(traceFile, traceFileName)) + foreach (var ev in parse(traceFile, traceFileName, nonFatalErrorMessage: (x) => { nonFatalParseError = x; })) { Magnesium.Severity newSeverity; if (severityMap.TryGetValue(new KeyValuePair(ev.Type, ev.Severity), out newSeverity)) @@ -876,6 +878,11 @@ namespace SummarizeTest if (ev.Type == "StderrSeverity") stderrSeverity = int.Parse(ev.Details.NewSeverity); } + if (nonFatalParseError != null) { + xout.Add(new XElement("NonFatalParseError", + new XAttribute("Severity", (int)Magnesium.Severity.SevWarnAlways), + new XAttribute("ErrorMessage", nonFatalParseError))); + } } catch (Exception e) diff --git a/contrib/TraceLogHelper/JsonParser.cs b/contrib/TraceLogHelper/JsonParser.cs index 9d7272a37f..84fbab81ab 100644 --- a/contrib/TraceLogHelper/JsonParser.cs +++ b/contrib/TraceLogHelper/JsonParser.cs @@ -1,4 +1,4 @@ -/* +/* * JsonParser.cs * * This source file is part of the FoundationDB open source project @@ -34,9 +34,10 @@ namespace Magnesium { static Random r = new Random(); + // dummy parameter nonFatalParseError to match xml public static IEnumerable Parse(System.IO.Stream stream, string file, bool keepOriginalElement = false, double startTime = -1, double endTime = Double.MaxValue, - double samplingFactor = 1.0) + double samplingFactor = 1.0, Action nonFatalErrorMessage = null) { using (var reader = new System.IO.StreamReader(stream)) { diff --git a/contrib/TraceLogHelper/XmlParser.cs b/contrib/TraceLogHelper/XmlParser.cs index 3728c58c3b..9ab79d920e 100644 --- a/contrib/TraceLogHelper/XmlParser.cs +++ b/contrib/TraceLogHelper/XmlParser.cs @@ -33,14 +33,29 @@ namespace Magnesium public static 
IEnumerable Parse(System.IO.Stream stream, string file, bool keepOriginalElement = false, double startTime = -1, double endTime = Double.MaxValue, - double samplingFactor = 1.0) + double samplingFactor = 1.0, Action nonFatalErrorMessage = null) { using (var reader = XmlReader.Create(stream)) { reader.ReadToDescendant("Trace"); reader.Read(); - foreach (var xev in StreamElements(reader)) + + // foreach (var xev in StreamElements(reader)) + // need to be able to catch and save non-fatal exceptions in StreamElements, so use explicit iterator instead of foreach + var iter = StreamElements(reader).GetEnumerator(); + while (true) { + try { + if (!iter.MoveNext()) { + break; + } + } catch (Exception e) { + if (nonFatalErrorMessage != null) { + nonFatalErrorMessage(e.Message); + } + break; + } + var xev = iter.Current; Event ev = null; try { @@ -165,28 +180,20 @@ namespace Magnesium } } + // throws exceptions if xml is invalid private static IEnumerable StreamElements(this XmlReader reader) { while (!reader.EOF) { if (reader.NodeType == XmlNodeType.Element) { - XElement node = null; - try - { - node = XElement.ReadFrom(reader) as XElement; - } - catch (Exception) { break; } + XElement node = XElement.ReadFrom(reader) as XElement; if (node != null) yield return node; } else { - try - { reader.Read(); - } - catch (Exception) { break; } } } } From 907248dcd4d06d1cc6bfffd4a2e5d7f1c364017f Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Wed, 19 May 2021 13:26:01 -0700 Subject: [PATCH 416/461] fixed a rare simulation bug where missingFinalCommit could be skipped by two successive logSystem changes --- fdbserver/OldTLogServer_6_0.actor.cpp | 2 +- fdbserver/OldTLogServer_6_2.actor.cpp | 2 +- fdbserver/TLogServer.actor.cpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fdbserver/OldTLogServer_6_0.actor.cpp b/fdbserver/OldTLogServer_6_0.actor.cpp index a442a3df6a..543111ede6 100644 --- a/fdbserver/OldTLogServer_6_0.actor.cpp +++ 
b/fdbserver/OldTLogServer_6_0.actor.cpp @@ -1498,10 +1498,10 @@ ACTOR Future doQueueCommit(TLogData* self, ACTOR Future commitQueue(TLogData* self) { state Reference logData; + state std::vector> missingFinalCommit; loop { int foundCount = 0; - state std::vector> missingFinalCommit; for (auto it : self->id_data) { if (!it.second->stopped) { logData = it.second; diff --git a/fdbserver/OldTLogServer_6_2.actor.cpp b/fdbserver/OldTLogServer_6_2.actor.cpp index a305b27f3a..c7fea829c5 100644 --- a/fdbserver/OldTLogServer_6_2.actor.cpp +++ b/fdbserver/OldTLogServer_6_2.actor.cpp @@ -1925,10 +1925,10 @@ ACTOR Future doQueueCommit(TLogData* self, ACTOR Future commitQueue(TLogData* self) { state Reference logData; + state std::vector> missingFinalCommit; loop { int foundCount = 0; - state std::vector> missingFinalCommit; for (auto it : self->id_data) { if (!it.second->stopped) { logData = it.second; diff --git a/fdbserver/TLogServer.actor.cpp b/fdbserver/TLogServer.actor.cpp index 76d4bf3bf2..4ea9e83bee 100644 --- a/fdbserver/TLogServer.actor.cpp +++ b/fdbserver/TLogServer.actor.cpp @@ -1965,10 +1965,10 @@ ACTOR Future doQueueCommit(TLogData* self, ACTOR Future commitQueue(TLogData* self) { state Reference logData; + state std::vector> missingFinalCommit; loop { int foundCount = 0; - state std::vector> missingFinalCommit; for (auto it : self->id_data) { if (!it.second->stopped) { logData = it.second; From 2fa80e79126381c1963333a13551feabf362436b Mon Sep 17 00:00:00 2001 From: Sreenath Bodagala Date: Wed, 19 May 2021 22:04:43 +0000 Subject: [PATCH 417/461] Address review comments --- .../sphinx/source/mr-status-json-schemas.rst.inc | 2 +- .../source/release-notes/release-notes-700.rst | 2 +- fdbclient/Schemas.cpp | 2 +- fdbserver/Status.actor.cpp | 2 +- fdbserver/storageserver.actor.cpp | 12 ++++++------ 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/documentation/sphinx/source/mr-status-json-schemas.rst.inc 
b/documentation/sphinx/source/mr-status-json-schemas.rst.inc index deb8afcdb7..7979331898 100644 --- a/documentation/sphinx/source/mr-status-json-schemas.rst.inc +++ b/documentation/sphinx/source/mr-status-json-schemas.rst.inc @@ -126,7 +126,7 @@ "counter":0, "roughness":0.0 }, - "fetch_frequency":{ + "fetches_from_logs":{ "hz":0.0, "counter":0, "roughness":0.0 diff --git a/documentation/sphinx/source/release-notes/release-notes-700.rst b/documentation/sphinx/source/release-notes/release-notes-700.rst index 85ca56979c..cec839fc2e 100644 --- a/documentation/sphinx/source/release-notes/release-notes-700.rst +++ b/documentation/sphinx/source/release-notes/release-notes-700.rst @@ -33,7 +33,7 @@ Status * Added ``commit_batching_window_size`` to the proxy roles section of status to record statistics about commit batching window size on each proxy. `(PR #4735) `_ * Added ``cluster.bounce_impact`` section to status to report if there will be any extra effects when bouncing the cluster, and if so, the reason for those effects. `(PR #4770) `_ * Added ``fetched_versions`` to the storage metrics section of status to report how fast a storage server is catching up in versions. `(PR #4770) `_ -* Added ``fetch_frequency`` to the storage metrics section of status to report the version fetching frequency of a storage server. `(PR #4770) `_ +* Added ``fetch_frequency`` to the storage metrics section of status to report how frequently a storage server fetches updates from transaction logs. 
`(PR #4770) `_ Bindings -------- diff --git a/fdbclient/Schemas.cpp b/fdbclient/Schemas.cpp index c6dc059573..5fef5fb6eb 100644 --- a/fdbclient/Schemas.cpp +++ b/fdbclient/Schemas.cpp @@ -149,7 +149,7 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema( "counter":0, "roughness":0.0 }, - "fetch_frequency":{ + "fetches_from_logs":{ "hz":0.0, "counter":0, "roughness":0.0 diff --git a/fdbserver/Status.actor.cpp b/fdbserver/Status.actor.cpp index 2277005867..5f546638ff 100644 --- a/fdbserver/Status.actor.cpp +++ b/fdbserver/Status.actor.cpp @@ -492,7 +492,7 @@ struct RolesInfo { obj["mutations"] = StatusCounter(storageMetrics.getValue("Mutations")).getStatus(); obj.setKeyRawNumber("local_rate", storageMetrics.getValue("LocalRate")); obj["fetched_versions"] = StatusCounter(storageMetrics.getValue("FetchedVersions")).getStatus(); - obj["fetch_frequency"] = StatusCounter(storageMetrics.getValue("FetchFrequency")).getStatus(); + obj["fetches_from_logs"] = StatusCounter(storageMetrics.getValue("FetchesFromLogs")).getStatus(); Version version = storageMetrics.getInt64("Version"); Version durableVersion = storageMetrics.getInt64("DurableVersion"); diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp index 4516e2a176..1db250d9cd 100644 --- a/fdbserver/storageserver.actor.cpp +++ b/fdbserver/storageserver.actor.cpp @@ -681,7 +681,7 @@ public: Counter fetchWaitingMS, fetchWaitingCount, fetchExecutingMS, fetchExecutingCount; Counter readsRejected; Counter fetchedVersions; - Counter fetchFrequency; + Counter fetchesFromLogs; LatencySample readLatencySample; LatencyBands readLatencyBands; @@ -700,10 +700,10 @@ public: fetchWaitingMS("FetchWaitingMS", cc), fetchWaitingCount("FetchWaitingCount", cc), fetchExecutingMS("FetchExecutingMS", cc), fetchExecutingCount("FetchExecutingCount", cc), readsRejected("ReadsRejected", cc), fetchedVersions("FetchedVersions", cc), - fetchFrequency("FetchFrequency", cc), 
readLatencySample("ReadLatencyMetrics", - self->thisServerID, - SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL, - SERVER_KNOBS->LATENCY_SAMPLE_SIZE), + fetchesFromLogs("FetchesFromLogs", cc), readLatencySample("ReadLatencyMetrics", + self->thisServerID, + SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL, + SERVER_KNOBS->LATENCY_SAMPLE_SIZE), readLatencyBands("ReadLatencyBands", self->thisServerID, SERVER_KNOBS->STORAGE_LOGGING_DELAY) { specialCounter(cc, "LastTLogVersion", [self]() { return self->lastTLogVersion; }); specialCounter(cc, "Version", [self]() { return self->version.get(); }); @@ -3529,7 +3529,7 @@ ACTOR Future update(StorageServer* data, bool* pReceivedUpdate) { data->otherError.getFuture().get(); data->counters.fetchedVersions += (ver - data->version.get()); - ++data->counters.fetchFrequency; + ++data->counters.fetchesFromLogs; Optional curSourceTLogID = cursor->getCurrentPeekLocation(); if (curSourceTLogID != data->sourceTLogID) { From 43e0d362df10991d40647e3730e279cf8e885ecf Mon Sep 17 00:00:00 2001 From: Sreenath Bodagala Date: Wed, 19 May 2021 22:12:34 +0000 Subject: [PATCH 418/461] Address a review comment --- documentation/sphinx/source/release-notes/release-notes-700.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/documentation/sphinx/source/release-notes/release-notes-700.rst b/documentation/sphinx/source/release-notes/release-notes-700.rst index cec839fc2e..ea78b9a10b 100644 --- a/documentation/sphinx/source/release-notes/release-notes-700.rst +++ b/documentation/sphinx/source/release-notes/release-notes-700.rst @@ -33,7 +33,7 @@ Status * Added ``commit_batching_window_size`` to the proxy roles section of status to record statistics about commit batching window size on each proxy. `(PR #4735) `_ * Added ``cluster.bounce_impact`` section to status to report if there will be any extra effects when bouncing the cluster, and if so, the reason for those effects. 
`(PR #4770) `_ * Added ``fetched_versions`` to the storage metrics section of status to report how fast a storage server is catching up in versions. `(PR #4770) `_ -* Added ``fetch_frequency`` to the storage metrics section of status to report how frequently a storage server fetches updates from transaction logs. `(PR #4770) `_ +* Added ``fetches_from_logs`` to the storage metrics section of status to report how frequently a storage server fetches updates from transaction logs. `(PR #4770) `_ Bindings -------- From 93c809764f647a173adf73bcab6990c276213302 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Wed, 19 May 2021 23:52:16 +0000 Subject: [PATCH 419/461] fix Schema check error --- fdbclient/Schemas.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fdbclient/Schemas.cpp b/fdbclient/Schemas.cpp index 682ddc9c9a..ae85799e85 100644 --- a/fdbclient/Schemas.cpp +++ b/fdbclient/Schemas.cpp @@ -727,7 +727,8 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema( "auto_logs":3, "commit_proxies":5, "grv_proxies":1, - "backup_worker_enabled":1 + "backup_worker_enabled":1, + "perpetual_storage_wiggle":0 }, "data":{ "least_operating_space_bytes_log_server":0, From a57061a5ed0de7b8ce9c60f40c0e31b4ee2f94d7 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Thu, 20 May 2021 00:06:53 +0000 Subject: [PATCH 420/461] change UID to PID meaning Process ID --- fdbclient/SystemData.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbclient/SystemData.cpp b/fdbclient/SystemData.cpp index fd8d2faa9f..0f035b745c 100644 --- a/fdbclient/SystemData.cpp +++ b/fdbclient/SystemData.cpp @@ -595,7 +595,7 @@ const KeyRangeRef configKeys(LiteralStringRef("\xff/conf/"), LiteralStringRef("\ const KeyRef configKeysPrefix = configKeys.begin; const KeyRef perpetualStorageWiggleKey(LiteralStringRef("\xff/conf/perpetual_storage_wiggle")); -const KeyRef wigglingStorageServerKey(LiteralStringRef("\xff/storageWiggleUID")); +const KeyRef 
wigglingStorageServerKey(LiteralStringRef("\xff/storageWigglePID")); const KeyRef triggerDDTeamInfoPrintKey(LiteralStringRef("\xff/triggerDDTeamInfoPrint")); From 64608fe86b76738d53cbbe00d631a3af8cbbbe85 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Thu, 20 May 2021 13:48:41 -0600 Subject: [PATCH 421/461] allow simulation properties to be overwritten --- fdbserver/SimulatedCluster.actor.cpp | 478 ++++++++++++------ fdbserver/TesterInterface.actor.h | 21 - fdbserver/tester.actor.cpp | 14 - tests/fast/AtomicBackupToDBCorrectness.toml | 1 + tests/fast/BackupToDBCorrectness.toml | 1 + tests/fast/BackupToDBCorrectnessClean.toml | 1 + tests/fast/ConfigureLocked.toml | 3 +- tests/fast/FuzzApiCorrectness.toml | 1 + tests/fast/FuzzApiCorrectnessClean.toml | 1 + tests/fast/KillRegionCycle.toml | 1 + tests/fast/LongStackWriteDuringRead.toml | 1 + tests/fast/LowLatency.toml | 1 + tests/fast/ProtocolVersion.toml | 1 + tests/fast/ReportConflictingKeys.toml | 1 + tests/fast/WriteDuringRead.toml | 1 + tests/fast/WriteDuringReadClean.toml | 1 + tests/rare/ConflictRangeCheck.toml | 1 + tests/rare/ConflictRangeRYOWCheck.toml | 1 + .../from_7.0.0/SnapIncrementalRestore-1.toml | 1 + tests/slow/ApiCorrectnessSwitchover.toml | 1 + tests/slow/DifferentClustersSameRV.toml | 1 + tests/slow/LowLatencyWithFailures.toml | 1 + ...elRestoreNewBackupCorrectnessAtomicOp.toml | 1 + ...allelRestoreNewBackupCorrectnessCycle.toml | 1 + ...estoreNewBackupCorrectnessMultiCycles.toml | 1 + ...NewBackupWriteDuringReadAtomicRestore.toml | 1 + ...elRestoreOldBackupCorrectnessAtomicOp.toml | 1 + ...estoreOldBackupCorrectnessMultiCycles.toml | 1 + ...OldBackupWriteDuringReadAtomicRestore.toml | 1 + tests/slow/SharedBackupCorrectness.toml | 1 + tests/slow/SharedBackupToDBCorrectness.toml | 1 + tests/slow/VersionStampBackupToDB.toml | 1 + tests/slow/VersionStampSwitchover.toml | 1 + tests/slow/WriteDuringReadAtomicRestore.toml | 1 + tests/slow/WriteDuringReadSwitchover.toml | 1 + 35 files changed, 364 
insertions(+), 183 deletions(-) diff --git a/fdbserver/SimulatedCluster.actor.cpp b/fdbserver/SimulatedCluster.actor.cpp index f10ca774bb..128eace3a8 100644 --- a/fdbserver/SimulatedCluster.actor.cpp +++ b/fdbserver/SimulatedCluster.actor.cpp @@ -22,6 +22,7 @@ #include #include #include +#include #include "fdbrpc/Locality.h" #include "fdbrpc/simulator.h" #include "fdbclient/DatabaseContext.h" @@ -37,8 +38,8 @@ #include "fdbclient/BackupAgent.actor.h" #include "fdbclient/versions.h" #include "flow/ProtocolVersion.h" -#include "flow/actorcompiler.h" // This must be the last #include. #include "flow/network.h" +#include "flow/actorcompiler.h" // This must be the last #include. #undef max #undef min @@ -46,10 +47,210 @@ extern "C" int g_expect_full_pointermap; extern const char* getSourceVersion(); +using namespace std::literals; + const int MACHINE_REBOOT_TIME = 10; bool destructed = false; +// Configuration details specified in workload test files that change the simulation +// environment details +class TestConfig { + class ConfigBuilder { + using value_type = toml::basic_value; + std::unordered_map> confMap; + + public: + ConfigBuilder& add(std::string_view key, int* value) { + confMap.emplace(key, [value](value_type const& v) { *value = v.as_integer(); }); + return *this; + } + ConfigBuilder& add(std::string_view key, Optional* value) { + confMap.emplace(key, [value](value_type const& v) { *value = v.as_integer(); }); + return *this; + } + ConfigBuilder& add(std::string_view key, bool* value) { + confMap.emplace(key, [value](value_type const& v) { *value = v.as_boolean(); }); + return *this; + } + ConfigBuilder& add(std::string_view key, Optional* value) { + confMap.emplace(key, [value](value_type const& v) { *value = v.as_boolean(); }); + return *this; + } + ConfigBuilder& add(std::string_view key, std::string* value) { + confMap.emplace(key, [value](value_type const& v) { *value = v.as_string(); }); + return *this; + } + ConfigBuilder& add(std::string_view key, 
Optional* value) { + confMap.emplace(key, [value](value_type const& v) { *value = v.as_string(); }); + return *this; + } + ConfigBuilder& add(std::string_view key, std::vector* value) { + confMap.emplace(key, [value](value_type const& v) { + auto arr = v.as_array(); + for (const auto& i : arr) { + value->push_back(i.as_integer()); + } + }); + return *this; + } + void set(std::string const& key, value_type const& val) { + auto iter = confMap.find(key); + if (iter == confMap.end()) { + std::cerr << "Unknown configuration attribute " << key << std::endl; + TraceEvent("UnknownConfigurationAttribute").detail("Name", key); + throw unknown_error(); + } + iter->second(val); + } + }; + + bool isIniFile(const char* fileName) { + std::string name = fileName; + auto pos = name.find_last_of('.'); + ASSERT(pos != std::string::npos && pos + 1 < name.size()); + auto extension = name.substr(pos + 1); + return extension == "txt"sv; + } + + void loadIniFile(const char* testFile) { + std::ifstream ifs; + ifs.open(testFile, std::ifstream::in); + if (!ifs.good()) + return; + + std::string cline; + + while (ifs.good()) { + getline(ifs, cline); + std::string line = removeWhitespace(std::string(cline)); + if (!line.size() || line.find(';') == 0) + continue; + + size_t found = line.find('='); + if (found == std::string::npos) + // hmmm, not good + continue; + std::string attrib = removeWhitespace(line.substr(0, found)); + std::string value = removeWhitespace(line.substr(found + 1)); + + if (attrib == "extraDB") { + sscanf(value.c_str(), "%d", &extraDB); + } + + if (attrib == "minimumReplication") { + sscanf(value.c_str(), "%d", &minimumReplication); + } + + if (attrib == "minimumRegions") { + sscanf(value.c_str(), "%d", &minimumRegions); + } + + if (attrib == "configureLocked") { + sscanf(value.c_str(), "%d", &configureLocked); + } + + if (attrib == "startIncompatibleProcess") { + startIncompatibleProcess = strcmp(value.c_str(), "true") == 0; + } + + if (attrib == "logAntiQuorum") { + 
sscanf(value.c_str(), "%d", &logAntiQuorum); + } + + if (attrib == "storageEngineExcludeTypes") { + std::stringstream ss(value); + for (int i; ss >> i;) { + storageEngineExcludeTypes.push_back(i); + if (ss.peek() == ',') { + ss.ignore(); + } + } + } + if (attrib == "maxTLogVersion") { + sscanf(value.c_str(), "%d", &maxTLogVersion); + } + } + + ifs.close(); + } + + +public: + int extraDB = 0; + int minimumReplication = 0; + int minimumRegions = 0; + bool configureLocked = false; + bool startIncompatibleProcess = false; + int logAntiQuorum = -1; + // Storage Engine Types: Verify match with SimulationConfig::generateNormalConfig + // 0 = "ssd" + // 1 = "memory" + // 2 = "memory-radixtree-beta" + // 3 = "ssd-redwood-experimental" + // Requires a comma-separated list of numbers WITHOUT whitespaces + std::vector storageEngineExcludeTypes; + // Set the maximum TLog version that can be selected for a test + // Refer to FDBTypes.h::TLogVersion. Defaults to the maximum supported version. + int maxTLogVersion = TLogVersion::MAX_SUPPORTED; + // Set true to simplify simulation configs for easier debugging + bool simpleConfig = false; + Optional generateFearless, buggify; + Optional datacenters, desiredTLogCount, commitProxyCount, grvProxyCount, resolverCount, storageEngineType, + stderrSeverity, machineCount, processesPerMachine, coordinators; + Optional config; + + void readFromConfig(const char* testFile) { + if (isIniFile(testFile)) { + loadIniFile(testFile); + return; + } + ConfigBuilder builder; + builder.add("extraDB", &extraDB) + .add("minimumReplication", &minimumReplication) + .add("minimumRegions", &minimumRegions) + .add("configureLocked", &configureLocked) + .add("startIncompatibleProcess", &startIncompatibleProcess) + .add("logAntiQuorum", &logAntiQuorum) + .add("storageEngineExcludeTypes", &storageEngineExcludeTypes) + .add("maxTLogVersion", &maxTLogVersion) + .add("simpleConfig", &simpleConfig) + .add("generateFearless", &generateFearless) + .add("datacenters", 
&datacenters) + .add("desiredTLogCount", &desiredTLogCount) + .add("commitProxyCount", &commitProxyCount) + .add("grvProxyCount", &grvProxyCount) + .add("resolverCount", &resolverCount) + .add("storageEngineType", &storageEngineType) + .add("config", &config) + .add("buggify", &buggify) + .add("StderrSeverity", &stderrSeverity) + .add("machineCount", &machineCount) + .add("processesPerMachine", &processesPerMachine) + .add("coordinators", &coordinators); + try { + auto file = toml::parse(testFile); + if (file.contains("configuration") && toml::find(file, "configuration").is_table()) { + auto conf = toml::find(file, "configuration").as_table(); + for (const auto& [key, value] : conf) { + if (key == "ClientInfoLogging") { + setNetworkOption(FDBNetworkOptions::DISABLE_CLIENT_STATISTICS_LOGGING); + } else { + builder.set(key, value); + } + } + if (stderrSeverity.present()) { + TraceEvent("StderrSeverity").detail("NewSeverity", stderrSeverity.get()); + } + } + } catch (std::exception& e) { + std::cerr << e.what() << std::endl; + TraceEvent("TOMLParseError").detail("Error", printable(e.what())); + throw unknown_error(); + } + } +}; + template T simulate(const T& in) { BinaryWriter writer(AssumeVersion(g_network->protocolVersion())); @@ -885,30 +1086,57 @@ StringRef StringRefOf(const char* s) { // of different combinations void SimulationConfig::generateNormalConfig(const TestConfig& testConfig) { set_config("new"); - const bool simple = false; // Set true to simplify simulation configs for easier debugging // generateMachineTeamTestConfig set up the number of servers per machine and the number of machines such that // if we do not remove the surplus server and machine teams, the simulation test will report error. // This is needed to make sure the number of server (and machine) teams is no larger than the desired number. bool generateMachineTeamTestConfig = BUGGIFY_WITH_PROB(0.1) ? true : false; - bool generateFearless = simple ? 
false : (testConfig.minimumRegions > 1 || deterministicRandom()->random01() < 0.5); - datacenters = simple ? 1 - : (generateFearless - ? (testConfig.minimumReplication > 0 || deterministicRandom()->random01() < 0.5 ? 4 : 6) + bool generateFearless = + testConfig.simpleConfig ? false : (testConfig.minimumRegions > 1 || deterministicRandom()->random01() < 0.5); + if (testConfig.generateFearless.present()) { + // overwrite whatever decision we made before + generateFearless = testConfig.generateFearless.get(); + } + datacenters = + testConfig.simpleConfig + ? 1 + : (generateFearless ? (testConfig.minimumReplication > 0 || deterministicRandom()->random01() < 0.5 ? 4 : 6) : deterministicRandom()->randomInt(1, 4)); - if (deterministicRandom()->random01() < 0.25) + if (testConfig.datacenters.present()) { + datacenters = testConfig.datacenters.get(); + } + if (testConfig.desiredTLogCount.present()) { + db.desiredTLogCount = testConfig.desiredTLogCount.get(); + } else if (deterministicRandom()->random01() < 0.25) { db.desiredTLogCount = deterministicRandom()->randomInt(1, 7); - if (deterministicRandom()->random01() < 0.25) + } + + if (testConfig.commitProxyCount.present()) { + db.commitProxyCount = testConfig.commitProxyCount.get(); + } else if (deterministicRandom()->random01() < 0.25) { db.commitProxyCount = deterministicRandom()->randomInt(1, 7); - if (deterministicRandom()->random01() < 0.25) + } + + if (testConfig.grvProxyCount.present()) { + db.grvProxyCount = testConfig.grvProxyCount.get(); + } else if (deterministicRandom()->random01() < 0.25) { db.grvProxyCount = deterministicRandom()->randomInt(1, 4); - if (deterministicRandom()->random01() < 0.25) + } + + if (testConfig.resolverCount.present()) { + db.resolverCount = testConfig.resolverCount.get(); + } else if (deterministicRandom()->random01() < 0.25) { db.resolverCount = deterministicRandom()->randomInt(1, 7); + } int storage_engine_type = deterministicRandom()->randomInt(0, 4); - // Continuously re-pick the 
storage engine type if it's the one we want to exclude - while (std::find(testConfig.storageEngineExcludeTypes.begin(), - testConfig.storageEngineExcludeTypes.end(), - storage_engine_type) != testConfig.storageEngineExcludeTypes.end()) { - storage_engine_type = deterministicRandom()->randomInt(0, 4); + if (testConfig.storageEngineType.present()) { + storage_engine_type = testConfig.storageEngineType.get(); + } else { + // Continuously re-pick the storage engine type if it's the one we want to exclude + while (std::find(testConfig.storageEngineExcludeTypes.begin(), + testConfig.storageEngineExcludeTypes.end(), + storage_engine_type) != testConfig.storageEngineExcludeTypes.end()) { + storage_engine_type = deterministicRandom()->randomInt(0, 4); + } } switch (storage_engine_type) { case 0: { @@ -941,75 +1169,81 @@ void SimulationConfig::generateNormalConfig(const TestConfig& testConfig) { // } // set_config("memory"); // set_config("memory-radixtree-beta"); - if (simple) { + if (testConfig.simpleConfig) { db.desiredTLogCount = 1; db.commitProxyCount = 1; db.grvProxyCount = 1; db.resolverCount = 1; } - int replication_type = simple ? 1 - : (std::max(testConfig.minimumReplication, - datacenters > 4 ? deterministicRandom()->randomInt(1, 3) - : std::min(deterministicRandom()->randomInt(0, 6), 3))); - switch (replication_type) { - case 0: { - TEST(true); // Simulated cluster using custom redundancy mode - int storage_servers = deterministicRandom()->randomInt(1, generateFearless ? 4 : 5); - // FIXME: log replicas must be more than storage replicas because otherwise better master exists will not - // recognize it needs to change dcs - int replication_factor = deterministicRandom()->randomInt(storage_servers, generateFearless ? 
4 : 5); - int anti_quorum = deterministicRandom()->randomInt( - 0, - (replication_factor / 2) + 1); // The anti quorum cannot be more than half of the replication factor, or the - // log system will continue to accept commits when a recovery is impossible - // Go through buildConfiguration, as it sets tLogPolicy/storagePolicy. - set_config(format("storage_replicas:=%d log_replicas:=%d log_anti_quorum:=%d " - "replica_datacenters:=1 min_replica_datacenters:=1", - storage_servers, - replication_factor, - anti_quorum)); - break; - } - case 1: { - TEST(true); // Simulated cluster running in single redundancy mode - set_config("single"); - break; - } - case 2: { - TEST(true); // Simulated cluster running in double redundancy mode - set_config("double"); - break; - } - case 3: { - if (datacenters <= 2 || generateFearless) { - TEST(true); // Simulated cluster running in triple redundancy mode - set_config("triple"); - } else if (datacenters == 3) { - TEST(true); // Simulated cluster running in 3 data-hall mode - set_config("three_data_hall"); - } else { - ASSERT(false); - } - break; - } - default: - ASSERT(false); // Programmer forgot to adjust cases. - } - - if (deterministicRandom()->random01() < 0.5) { - int logSpill = deterministicRandom()->randomInt(TLogSpillType::VALUE, TLogSpillType::END); - set_config(format("log_spill:=%d", logSpill)); - int logVersion = deterministicRandom()->randomInt(TLogVersion::MIN_RECRUITABLE, testConfig.maxTLogVersion + 1); - set_config(format("log_version:=%d", logVersion)); + if (testConfig.config.present()) { + set_config(testConfig.config.get()); } else { - if (deterministicRandom()->random01() < 0.7) - set_config(format("log_version:=%d", testConfig.maxTLogVersion)); - if (deterministicRandom()->random01() < 0.5) - set_config(format("log_spill:=%d", TLogSpillType::DEFAULT)); - } - - if (deterministicRandom()->random01() < 0.5) { - set_config("backup_worker_enabled:=1"); + int replication_type = testConfig.simpleConfig + ? 
1 + : (std::max(testConfig.minimumReplication, + datacenters > 4 ? deterministicRandom()->randomInt(1, 3) + : std::min(deterministicRandom()->randomInt(0, 6), 3))); + switch (replication_type) { + case 0: { + TEST(true); // Simulated cluster using custom redundancy mode + int storage_servers = deterministicRandom()->randomInt(1, generateFearless ? 4 : 5); + // FIXME: log replicas must be more than storage replicas because otherwise better master exists will not + // recognize it needs to change dcs + int replication_factor = deterministicRandom()->randomInt(storage_servers, generateFearless ? 4 : 5); + int anti_quorum = deterministicRandom()->randomInt( + 0, + (replication_factor / 2) + + 1); // The anti quorum cannot be more than half of the replication factor, or the + // log system will continue to accept commits when a recovery is impossible + // Go through buildConfiguration, as it sets tLogPolicy/storagePolicy. + set_config(format("storage_replicas:=%d log_replicas:=%d log_anti_quorum:=%d " + "replica_datacenters:=1 min_replica_datacenters:=1", + storage_servers, + replication_factor, + anti_quorum)); + break; + } + case 1: { + TEST(true); // Simulated cluster running in single redundancy mode + set_config("single"); + break; + } + case 2: { + TEST(true); // Simulated cluster running in double redundancy mode + set_config("double"); + break; + } + case 3: { + if (datacenters <= 2 || generateFearless) { + TEST(true); // Simulated cluster running in triple redundancy mode + set_config("triple"); + } else if (datacenters == 3) { + TEST(true); // Simulated cluster running in 3 data-hall mode + set_config("three_data_hall"); + } else { + ASSERT(false); + } + break; + } + default: + ASSERT(false); // Programmer forgot to adjust cases. 
+ } + if (deterministicRandom()->random01() < 0.5) { + int logSpill = deterministicRandom()->randomInt(TLogSpillType::VALUE, TLogSpillType::END); + set_config(format("log_spill:=%d", logSpill)); + int logVersion = + deterministicRandom()->randomInt(TLogVersion::MIN_RECRUITABLE, testConfig.maxTLogVersion + 1); + set_config(format("log_version:=%d", logVersion)); + } else { + if (deterministicRandom()->random01() < 0.7) + set_config(format("log_version:=%d", testConfig.maxTLogVersion)); + if (deterministicRandom()->random01() < 0.5) + set_config(format("log_spill:=%d", TLogSpillType::DEFAULT)); + } + + if (deterministicRandom()->random01() < 0.5) { + set_config("backup_worker_enabled:=1"); + } } if (generateFearless || (datacenters == 2 && deterministicRandom()->random01() < 0.5)) { @@ -1211,7 +1445,9 @@ void SimulationConfig::generateNormalConfig(const TestConfig& testConfig) { } } - if (generateFearless && testConfig.minimumReplication > 1) { + if (testConfig.machineCount.present()) { + machine_count = testConfig.machineCount.get(); + } else if (generateFearless && testConfig.minimumReplication > 1) { // low latency tests in fearless configurations need 4 machines per datacenter (3 for triple replication, 1 that // is down during failures). machine_count = 16; @@ -1234,11 +1470,15 @@ void SimulationConfig::generateNormalConfig(const TestConfig& testConfig) { } } - // because we protect a majority of coordinators from being killed, it is better to run with low numbers of - // coordinators to prevent too many processes from being protected - coordinators = (testConfig.minimumRegions <= 1 && BUGGIFY) - ? 
deterministicRandom()->randomInt(1, std::max(machine_count, 2)) - : 1; + if (testConfig.coordinators.present()) { + coordinators = testConfig.coordinators.get(); + } else { + // because we protect a majority of coordinators from being killed, it is better to run with low numbers of + // coordinators to prevent too many processes from being protected + coordinators = (testConfig.minimumRegions <= 1 && BUGGIFY) + ? deterministicRandom()->randomInt(1, std::max(machine_count, 2)) + : 1; + } if (testConfig.minimumReplication > 1 && datacenters == 3) { // low latency tests in 3 data hall mode need 2 other data centers with 2 machines each to avoid waiting for @@ -1247,7 +1487,9 @@ void SimulationConfig::generateNormalConfig(const TestConfig& testConfig) { coordinators = 3; } - if (generateFearless) { + if (testConfig.processesPerMachine.present()) { + processes_per_machine = testConfig.processesPerMachine.get(); + } else if (generateFearless) { processes_per_machine = 1; } else { processes_per_machine = deterministicRandom()->randomInt(1, (extraDB ? 14 : 28) / machine_count + 2); @@ -1626,68 +1868,10 @@ void setupSimulatedSystem(vector>* systemActors, .detail("StartingConfiguration", pStartingConfiguration->toString()); } +using namespace std::literals; + // Populates the TestConfig fields according to what is found in the test file. 
-void checkTestConf(const char* testFile, TestConfig* testConfig) { - std::ifstream ifs; - ifs.open(testFile, std::ifstream::in); - if (!ifs.good()) - return; - - std::string cline; - - while (ifs.good()) { - getline(ifs, cline); - std::string line = removeWhitespace(std::string(cline)); - if (!line.size() || line.find(';') == 0) - continue; - - size_t found = line.find('='); - if (found == std::string::npos) - // hmmm, not good - continue; - std::string attrib = removeWhitespace(line.substr(0, found)); - std::string value = removeWhitespace(line.substr(found + 1)); - - if (attrib == "extraDB") { - sscanf(value.c_str(), "%d", &testConfig->extraDB); - } - - if (attrib == "minimumReplication") { - sscanf(value.c_str(), "%d", &testConfig->minimumReplication); - } - - if (attrib == "minimumRegions") { - sscanf(value.c_str(), "%d", &testConfig->minimumRegions); - } - - if (attrib == "configureLocked") { - sscanf(value.c_str(), "%d", &testConfig->configureLocked); - } - - if (attrib == "startIncompatibleProcess") { - testConfig->startIncompatibleProcess = strcmp(value.c_str(), "true") == 0; - } - - if (attrib == "logAntiQuorum") { - sscanf(value.c_str(), "%d", &testConfig->logAntiQuorum); - } - - if (attrib == "storageEngineExcludeTypes") { - std::stringstream ss(value); - for (int i; ss >> i;) { - testConfig->storageEngineExcludeTypes.push_back(i); - if (ss.peek() == ',') { - ss.ignore(); - } - } - } - if (attrib == "maxTLogVersion") { - sscanf(value.c_str(), "%d", &testConfig->maxTLogVersion); - } - } - - ifs.close(); -} +void checkTestConf(const char* testFile, TestConfig* testConfig) {} ACTOR void setupAndRun(std::string dataFolder, const char* testFile, @@ -1699,7 +1883,7 @@ ACTOR void setupAndRun(std::string dataFolder, state Standalone startingConfiguration; state int testerCount = 1; state TestConfig testConfig; - checkTestConf(testFile, &testConfig); + testConfig.readFromConfig(testFile); g_simulator.hasDiffProtocolProcess = testConfig.startIncompatibleProcess; 
g_simulator.setDiffProtocol = false; diff --git a/fdbserver/TesterInterface.actor.h b/fdbserver/TesterInterface.actor.h index ddfb04da22..8320cc566b 100644 --- a/fdbserver/TesterInterface.actor.h +++ b/fdbserver/TesterInterface.actor.h @@ -100,27 +100,6 @@ struct WorkloadRequest { } }; -// Configuration details specified in workload test files that change the simulation -// environment details -struct TestConfig { - int extraDB = 0; - int minimumReplication = 0; - int minimumRegions = 0; - int configureLocked = 0; - bool startIncompatibleProcess = false; - int logAntiQuorum = -1; - // Storage Engine Types: Verify match with SimulationConfig::generateNormalConfig - // 0 = "ssd" - // 1 = "memory" - // 2 = "memory-radixtree-beta" - // 3 = "ssd-redwood-experimental" - // Requires a comma-separated list of numbers WITHOUT whitespaces - std::vector storageEngineExcludeTypes; - // Set the maximum TLog version that can be selected for a test - // Refer to FDBTypes.h::TLogVersion. Defaults to the maximum supported version. 
- int maxTLogVersion = TLogVersion::MAX_SUPPORTED; -}; - struct TesterInterface { constexpr static FileIdentifier file_identifier = 4465210; RequestStream recruitments; diff --git a/fdbserver/tester.actor.cpp b/fdbserver/tester.actor.cpp index fa18a376a4..4b98b38486 100644 --- a/fdbserver/tester.actor.cpp +++ b/fdbserver/tester.actor.cpp @@ -1249,20 +1249,6 @@ std::vector readTOMLTests_(std::string fileName) { const toml::value& conf = toml::parse(fileName); - // Handle all global settings - for (const auto& [k, v] : conf.as_table()) { - if (k == "test") { - continue; - } - if (testSpecGlobalKeys.find(k) != testSpecGlobalKeys.end()) { - testSpecGlobalKeys[k](toml_to_string(v)); - } else { - TraceEvent(SevError, "TestSpecUnrecognizedGlobalParam") - .detail("Attrib", k) - .detail("Value", toml_to_string(v)); - } - } - // Then parse each test const toml::array& tests = toml::find(conf, "test").as_array(); for (const toml::value& test : tests) { diff --git a/tests/fast/AtomicBackupToDBCorrectness.toml b/tests/fast/AtomicBackupToDBCorrectness.toml index cc6f8e453a..1b601da923 100644 --- a/tests/fast/AtomicBackupToDBCorrectness.toml +++ b/tests/fast/AtomicBackupToDBCorrectness.toml @@ -1,3 +1,4 @@ +[configuration] extraDB = 1 [[test]] diff --git a/tests/fast/BackupToDBCorrectness.toml b/tests/fast/BackupToDBCorrectness.toml index 62f30151d4..cf50093657 100644 --- a/tests/fast/BackupToDBCorrectness.toml +++ b/tests/fast/BackupToDBCorrectness.toml @@ -1,3 +1,4 @@ +[configuration] extraDB = 1 [[test]] diff --git a/tests/fast/BackupToDBCorrectnessClean.toml b/tests/fast/BackupToDBCorrectnessClean.toml index 0dfdbbd8b0..9c2e9135e5 100644 --- a/tests/fast/BackupToDBCorrectnessClean.toml +++ b/tests/fast/BackupToDBCorrectnessClean.toml @@ -1,3 +1,4 @@ +[configuration] extraDB = 1 [[test]] diff --git a/tests/fast/ConfigureLocked.toml b/tests/fast/ConfigureLocked.toml index 169fc2a2a3..592701931a 100644 --- a/tests/fast/ConfigureLocked.toml +++ b/tests/fast/ConfigureLocked.toml 
@@ -1,4 +1,5 @@ -configureLocked = 1 +[configuration] +configureLocked = true [[test]] testTitle = 'ConfigureLocked' diff --git a/tests/fast/FuzzApiCorrectness.toml b/tests/fast/FuzzApiCorrectness.toml index 0e1e88619c..20d4e215b5 100644 --- a/tests/fast/FuzzApiCorrectness.toml +++ b/tests/fast/FuzzApiCorrectness.toml @@ -1,3 +1,4 @@ +[configuration] StderrSeverity = 30 [[test]] diff --git a/tests/fast/FuzzApiCorrectnessClean.toml b/tests/fast/FuzzApiCorrectnessClean.toml index 9b66edf86d..7165deda42 100644 --- a/tests/fast/FuzzApiCorrectnessClean.toml +++ b/tests/fast/FuzzApiCorrectnessClean.toml @@ -1,3 +1,4 @@ +[configuration] StderrSeverity = 30 [[test]] diff --git a/tests/fast/KillRegionCycle.toml b/tests/fast/KillRegionCycle.toml index 71eebfbc2a..77bd6ce2ef 100644 --- a/tests/fast/KillRegionCycle.toml +++ b/tests/fast/KillRegionCycle.toml @@ -1,3 +1,4 @@ +[configuration] minimumRegions = 2 [[test]] diff --git a/tests/fast/LongStackWriteDuringRead.toml b/tests/fast/LongStackWriteDuringRead.toml index e80ff22846..d65d9a2a91 100644 --- a/tests/fast/LongStackWriteDuringRead.toml +++ b/tests/fast/LongStackWriteDuringRead.toml @@ -1,3 +1,4 @@ +[configuration] StderrSeverity = 30 [[test]] diff --git a/tests/fast/LowLatency.toml b/tests/fast/LowLatency.toml index bcf71ba942..d8af3b38c9 100644 --- a/tests/fast/LowLatency.toml +++ b/tests/fast/LowLatency.toml @@ -1,3 +1,4 @@ +[configuration] buggify = false minimumReplication = 2 diff --git a/tests/fast/ProtocolVersion.toml b/tests/fast/ProtocolVersion.toml index 626b876dd5..2cf223b5db 100644 --- a/tests/fast/ProtocolVersion.toml +++ b/tests/fast/ProtocolVersion.toml @@ -1,3 +1,4 @@ +[configuration] startIncompatibleProcess = true [[test]] diff --git a/tests/fast/ReportConflictingKeys.toml b/tests/fast/ReportConflictingKeys.toml index 2f81880c00..6b0654c143 100644 --- a/tests/fast/ReportConflictingKeys.toml +++ b/tests/fast/ReportConflictingKeys.toml @@ -1,3 +1,4 @@ +[configuration] buggify = false [[test]] diff --git 
a/tests/fast/WriteDuringRead.toml b/tests/fast/WriteDuringRead.toml index 82b39e78ae..565fc957df 100644 --- a/tests/fast/WriteDuringRead.toml +++ b/tests/fast/WriteDuringRead.toml @@ -1,3 +1,4 @@ +[configuration] StderrSeverity = 30 [[test]] diff --git a/tests/fast/WriteDuringReadClean.toml b/tests/fast/WriteDuringReadClean.toml index fca62f39ec..83e61507c2 100644 --- a/tests/fast/WriteDuringReadClean.toml +++ b/tests/fast/WriteDuringReadClean.toml @@ -1,3 +1,4 @@ +[configuration] StderrSeverity = 30 [[test]] diff --git a/tests/rare/ConflictRangeCheck.toml b/tests/rare/ConflictRangeCheck.toml index f923ebb137..3e8860fd50 100644 --- a/tests/rare/ConflictRangeCheck.toml +++ b/tests/rare/ConflictRangeCheck.toml @@ -1,3 +1,4 @@ +[configuration] buggify = false [[test]] diff --git a/tests/rare/ConflictRangeRYOWCheck.toml b/tests/rare/ConflictRangeRYOWCheck.toml index 1a2e4f39d0..d190459170 100644 --- a/tests/rare/ConflictRangeRYOWCheck.toml +++ b/tests/rare/ConflictRangeRYOWCheck.toml @@ -1,3 +1,4 @@ +[configuration] buggify = false [[test]] diff --git a/tests/restarting/from_7.0.0/SnapIncrementalRestore-1.toml b/tests/restarting/from_7.0.0/SnapIncrementalRestore-1.toml index efa3bae128..6321090c4e 100644 --- a/tests/restarting/from_7.0.0/SnapIncrementalRestore-1.toml +++ b/tests/restarting/from_7.0.0/SnapIncrementalRestore-1.toml @@ -1,3 +1,4 @@ +[configuration] logAntiQuorum = 0 [[test]] diff --git a/tests/slow/ApiCorrectnessSwitchover.toml b/tests/slow/ApiCorrectnessSwitchover.toml index 98b5ebd3a1..d97474e422 100644 --- a/tests/slow/ApiCorrectnessSwitchover.toml +++ b/tests/slow/ApiCorrectnessSwitchover.toml @@ -1,3 +1,4 @@ +[configuration] extraDB = 2 [[test]] diff --git a/tests/slow/DifferentClustersSameRV.toml b/tests/slow/DifferentClustersSameRV.toml index 4cda3eaea4..4d14271361 100644 --- a/tests/slow/DifferentClustersSameRV.toml +++ b/tests/slow/DifferentClustersSameRV.toml @@ -1,3 +1,4 @@ +[configuration] extraDB = 2 [[test]] diff --git 
a/tests/slow/LowLatencyWithFailures.toml b/tests/slow/LowLatencyWithFailures.toml index 3888bb9c26..21514f247c 100644 --- a/tests/slow/LowLatencyWithFailures.toml +++ b/tests/slow/LowLatencyWithFailures.toml @@ -1,3 +1,4 @@ +[configuration] minimumReplication = 2 [[test]] diff --git a/tests/slow/ParallelRestoreNewBackupCorrectnessAtomicOp.toml b/tests/slow/ParallelRestoreNewBackupCorrectnessAtomicOp.toml index a208f02872..ba56f68d31 100644 --- a/tests/slow/ParallelRestoreNewBackupCorrectnessAtomicOp.toml +++ b/tests/slow/ParallelRestoreNewBackupCorrectnessAtomicOp.toml @@ -1,4 +1,5 @@ # Disable buggify for parallel restore +#[configuration] #buggify=on [[test]] diff --git a/tests/slow/ParallelRestoreNewBackupCorrectnessCycle.toml b/tests/slow/ParallelRestoreNewBackupCorrectnessCycle.toml index 2c2d4b0333..3acece923d 100644 --- a/tests/slow/ParallelRestoreNewBackupCorrectnessCycle.toml +++ b/tests/slow/ParallelRestoreNewBackupCorrectnessCycle.toml @@ -1,4 +1,5 @@ # Disable buggify for parallel restore +#[configuration] #buggify=off [[test]] diff --git a/tests/slow/ParallelRestoreNewBackupCorrectnessMultiCycles.toml b/tests/slow/ParallelRestoreNewBackupCorrectnessMultiCycles.toml index c94b2bc7a8..7862f5784a 100644 --- a/tests/slow/ParallelRestoreNewBackupCorrectnessMultiCycles.toml +++ b/tests/slow/ParallelRestoreNewBackupCorrectnessMultiCycles.toml @@ -1,4 +1,5 @@ # Disable buggify for parallel restore +#[configuration] #buggify=off [[test]] diff --git a/tests/slow/ParallelRestoreNewBackupWriteDuringReadAtomicRestore.toml b/tests/slow/ParallelRestoreNewBackupWriteDuringReadAtomicRestore.toml index ae92b4b956..4b305660bc 100644 --- a/tests/slow/ParallelRestoreNewBackupWriteDuringReadAtomicRestore.toml +++ b/tests/slow/ParallelRestoreNewBackupWriteDuringReadAtomicRestore.toml @@ -1,3 +1,4 @@ +[configuration] StderrSeverity = 30 [[test]] diff --git a/tests/slow/ParallelRestoreOldBackupCorrectnessAtomicOp.toml 
b/tests/slow/ParallelRestoreOldBackupCorrectnessAtomicOp.toml index 04130159d1..647d15ec26 100644 --- a/tests/slow/ParallelRestoreOldBackupCorrectnessAtomicOp.toml +++ b/tests/slow/ParallelRestoreOldBackupCorrectnessAtomicOp.toml @@ -1,4 +1,5 @@ # Disable buggify for parallel restore +#[configuration] #buggify=on [[test]] diff --git a/tests/slow/ParallelRestoreOldBackupCorrectnessMultiCycles.toml b/tests/slow/ParallelRestoreOldBackupCorrectnessMultiCycles.toml index 8dc215c593..8f6f7b92aa 100644 --- a/tests/slow/ParallelRestoreOldBackupCorrectnessMultiCycles.toml +++ b/tests/slow/ParallelRestoreOldBackupCorrectnessMultiCycles.toml @@ -1,4 +1,5 @@ # Disable buggify for parallel restore +#[configuration] #buggify=off [[test]] diff --git a/tests/slow/ParallelRestoreOldBackupWriteDuringReadAtomicRestore.toml b/tests/slow/ParallelRestoreOldBackupWriteDuringReadAtomicRestore.toml index e09dc4fdd9..0479031d75 100644 --- a/tests/slow/ParallelRestoreOldBackupWriteDuringReadAtomicRestore.toml +++ b/tests/slow/ParallelRestoreOldBackupWriteDuringReadAtomicRestore.toml @@ -1,3 +1,4 @@ +[configuration] StderrSeverity = 30 [[test]] diff --git a/tests/slow/SharedBackupCorrectness.toml b/tests/slow/SharedBackupCorrectness.toml index 253736e6ce..c03b89831a 100644 --- a/tests/slow/SharedBackupCorrectness.toml +++ b/tests/slow/SharedBackupCorrectness.toml @@ -1,3 +1,4 @@ +[configuration] extraDB = 1 [[test]] diff --git a/tests/slow/SharedBackupToDBCorrectness.toml b/tests/slow/SharedBackupToDBCorrectness.toml index 2a6b45f0ec..3a3a07dfbd 100644 --- a/tests/slow/SharedBackupToDBCorrectness.toml +++ b/tests/slow/SharedBackupToDBCorrectness.toml @@ -1,3 +1,4 @@ +[configuration] extraDB = 1 [[test]] diff --git a/tests/slow/VersionStampBackupToDB.toml b/tests/slow/VersionStampBackupToDB.toml index 29d86df4e1..4b36182dd0 100644 --- a/tests/slow/VersionStampBackupToDB.toml +++ b/tests/slow/VersionStampBackupToDB.toml @@ -1,3 +1,4 @@ +[configuration] extraDB = 2 [[test]] diff --git 
a/tests/slow/VersionStampSwitchover.toml b/tests/slow/VersionStampSwitchover.toml index f65086ab59..328c199b93 100644 --- a/tests/slow/VersionStampSwitchover.toml +++ b/tests/slow/VersionStampSwitchover.toml @@ -1,3 +1,4 @@ +[configuration] extraDB = 2 [[test]] diff --git a/tests/slow/WriteDuringReadAtomicRestore.toml b/tests/slow/WriteDuringReadAtomicRestore.toml index 96868a11ef..a148f0a1c9 100644 --- a/tests/slow/WriteDuringReadAtomicRestore.toml +++ b/tests/slow/WriteDuringReadAtomicRestore.toml @@ -1,3 +1,4 @@ +[configuration] StderrSeverity = 30 [[test]] diff --git a/tests/slow/WriteDuringReadSwitchover.toml b/tests/slow/WriteDuringReadSwitchover.toml index 7eaa3a36b0..b5232c9119 100644 --- a/tests/slow/WriteDuringReadSwitchover.toml +++ b/tests/slow/WriteDuringReadSwitchover.toml @@ -1,3 +1,4 @@ +[configuration] StderrSeverity = 30 extraDB = 2 From bb076115c9b308ecfa703f396955c82df65d5c3d Mon Sep 17 00:00:00 2001 From: Jingyu Zhou Date: Fri, 21 May 2021 16:40:29 -0700 Subject: [PATCH 422/461] Only enable backup worker when using partitioned logs This addresses issue #4849. 
--- fdbclient/FileBackupAgent.actor.cpp | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/fdbclient/FileBackupAgent.actor.cpp b/fdbclient/FileBackupAgent.actor.cpp index dfd33b5b67..317e4cc095 100644 --- a/fdbclient/FileBackupAgent.actor.cpp +++ b/fdbclient/FileBackupAgent.actor.cpp @@ -2705,13 +2705,17 @@ struct StartFullBackupTaskFunc : BackupTaskFuncBase { wait(checkTaskVersion(cx, task, StartFullBackupTaskFunc::name, StartFullBackupTaskFunc::version)); state Reference tr(new ReadYourWritesTransaction(cx)); + state BackupConfig config(task); + state Future> partitionedLog; loop { try { tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); tr->setOption(FDBTransactionOptions::LOCK_AWARE); - Version startVersion = wait(tr->getReadVersion()); + partitionedLog = config.partitionedLogEnabled().get(tr); + state Future startVersionFuture = tr->getReadVersion(); + wait(success(partitionedLog) && success(startVersionFuture)); - Params.beginVersion().set(task, startVersion); + Params.beginVersion().set(task, startVersionFuture.get()); break; } catch (Error& e) { wait(tr->onError(e)); @@ -2721,14 +2725,15 @@ struct StartFullBackupTaskFunc : BackupTaskFuncBase { // Check if backup worker is enabled DatabaseConfiguration dbConfig = wait(getDatabaseConfiguration(cx)); state bool backupWorkerEnabled = dbConfig.backupWorkerEnabled; - if (!backupWorkerEnabled) { + if (!backupWorkerEnabled && partitionedLog.get().present() && partitionedLog.get().get()) { + // Change configuration only when we set to use partitioned logs and + // the flag was not set before. 
wait(success(changeConfig(cx, "backup_worker_enabled:=1", true))); backupWorkerEnabled = true; } // Set the "backupStartedKey" and wait for all backup worker started tr->reset(); - state BackupConfig config(task); loop { state Future watchFuture; try { @@ -2738,7 +2743,7 @@ struct StartFullBackupTaskFunc : BackupTaskFuncBase { state Future> started = tr->get(backupStartedKey); state Future> taskStarted = tr->get(config.allWorkerStarted().key); - state Future> partitionedLog = config.partitionedLogEnabled().get(tr); + partitionedLog = config.partitionedLogEnabled().get(tr); wait(success(started) && success(taskStarted) && success(partitionedLog)); if (!partitionedLog.get().present() || !partitionedLog.get().get()) { From 19b9a35c586b7d6c517560df09d89c351d53629a Mon Sep 17 00:00:00 2001 From: Sreenath Bodagala Date: Mon, 24 May 2021 18:37:48 +0000 Subject: [PATCH 423/461] Expose "bounce impact" and Storage Server "version catch-up rate" metrics Update the Status section of release-notes-630.rst with info about the new status fields introduced by PR https://github.com/apple/foundationdb/pull/4770 . --- .../sphinx/source/release-notes/release-notes-630.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/documentation/sphinx/source/release-notes/release-notes-630.rst b/documentation/sphinx/source/release-notes/release-notes-630.rst index 2057e7fcb2..8d2ff59aba 100644 --- a/documentation/sphinx/source/release-notes/release-notes-630.rst +++ b/documentation/sphinx/source/release-notes/release-notes-630.rst @@ -113,6 +113,9 @@ Status * If a process is unable to flush trace logs to disk, the problem will now be reported via the output of ``status`` command inside ``fdbcli``. `(PR #2605) `_ `(PR #2820) `_ * When a configuration key is changed, it will always be included in ``status json`` output, even the value is reverted back to the default value. 
[6.3.5] `(PR #3610) `_ * Added transactions.rejected_for_queued_too_long for bookkeeping the number of transactions rejected by commit proxy because its queuing time exceeds MVCC window.[6.3.11] `(PR #4353) `_ +* Added ``cluster.bounce_impact`` section to status to report if there will be any extra effects when bouncing the cluster, and if so, the reason for those effects. `(PR #4770) `_ +* Added ``fetched_versions`` to the storage metrics section of status to report how fast a storage server is catching up in versions. `(PR #4770) `_ +* Added ``fetches_from_logs`` to the storage metrics section of status to report how frequently a storage server fetches updates from transaction logs. `(PR #4770) `_ Bindings -------- From eaad5798dd69cdf9c2be6ba65c797c9ca03f8819 Mon Sep 17 00:00:00 2001 From: Jon Fu Date: Mon, 24 May 2021 15:10:58 -0400 Subject: [PATCH 424/461] update release notes to match the ones on release-6.3 --- .../sphinx/source/release-notes/release-notes-630.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/documentation/sphinx/source/release-notes/release-notes-630.rst b/documentation/sphinx/source/release-notes/release-notes-630.rst index 8d2ff59aba..38eab2be6f 100644 --- a/documentation/sphinx/source/release-notes/release-notes-630.rst +++ b/documentation/sphinx/source/release-notes/release-notes-630.rst @@ -5,14 +5,19 @@ Release Notes 6.3.13 ====== +* Added ``commit_batching_window_size`` to the proxy roles section of status to record statistics about commit batching window size on each proxy. `(PR #4736) `_ * The multi-version client now requires at most two client connections with version 6.2 or larger, regardless of how many external clients are configured. Clients older than 6.2 will continue to create an additional connection each. `(PR #4667) `_ * Fix an accounting error that could potentially result in inaccuracies in priority busyness metrics. 
`(PR #4824) `_ +* Added ``cluster.bounce_impact`` section to status to report if there will be any extra effects when bouncing the cluster, and if so, the reason for those effects. `(PR #4770) `_ +* Added ``fetched_versions`` to the storage metrics section of status to report how fast a storage server is catching up in versions. `(PR #4770) `_ +* Added ``fetches_from_logs`` to the storage metrics section of status to report how frequently a storage server fetches updates from transaction logs. `(PR #4770) `_ 6.3.12 ====== * Change the default for --knob_tls_server_handshake_threads to 64. The previous was 1000. This avoids starting 1000 threads by default, but may adversely affect recovery time for large clusters using tls. Users with large tls clusters should consider explicitly setting this knob in their foundationdb.conf file. `(PR #4421) `_ * Fix accounting error that could cause commits to incorrectly fail with ``proxy_memory_limit_exceeded``. `(PR #4526) `_ * As an optimization, partial restore using target key ranges now filters backup log data prior to loading it into the database. `(PR #4554) `_ +* Fix fault tolerance calculation when there are no tLogs in LogSet. 
`(PR #4454) `_ 6.3.11 ====== From a107604d3a4dde095448f623f6e31601eeace18d Mon Sep 17 00:00:00 2001 From: Jon Fu Date: Mon, 24 May 2021 15:53:21 -0400 Subject: [PATCH 425/461] Move patch notes to the appropriate section, and add new 6.3.14 notes --- .../sphinx/source/release-notes/release-notes-630.rst | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/documentation/sphinx/source/release-notes/release-notes-630.rst b/documentation/sphinx/source/release-notes/release-notes-630.rst index 38eab2be6f..6cb34252ec 100644 --- a/documentation/sphinx/source/release-notes/release-notes-630.rst +++ b/documentation/sphinx/source/release-notes/release-notes-630.rst @@ -3,14 +3,18 @@ Release Notes ############# +6.3.14 +====== +* Added ``cluster.bounce_impact`` section to status to report if there will be any extra effects when bouncing the cluster, and if so, the reason for those effects. `(PR #4770) `_ +* Added ``fetched_versions`` to the storage metrics section of status to report how fast a storage server is catching up in versions. `(PR #4770) `_ +* Added ``fetches_from_logs`` to the storage metrics section of status to report how frequently a storage server fetches updates from transaction logs. `(PR #4770) `_ +* Added the ``bypass_unreadable`` transaction option which allows ``get`` operations to read from sections of keyspace that have become unreadable because of versionstamp operations. `(PR #4774) `_ + 6.3.13 ====== * Added ``commit_batching_window_size`` to the proxy roles section of status to record statistics about commit batching window size on each proxy. `(PR #4736) `_ * The multi-version client now requires at most two client connections with version 6.2 or larger, regardless of how many external clients are configured. Clients older than 6.2 will continue to create an additional connection each. `(PR #4667) `_ * Fix an accounting error that could potentially result in inaccuracies in priority busyness metrics. 
`(PR #4824) `_ -* Added ``cluster.bounce_impact`` section to status to report if there will be any extra effects when bouncing the cluster, and if so, the reason for those effects. `(PR #4770) `_ -* Added ``fetched_versions`` to the storage metrics section of status to report how fast a storage server is catching up in versions. `(PR #4770) `_ -* Added ``fetches_from_logs`` to the storage metrics section of status to report how frequently a storage server fetches updates from transaction logs. `(PR #4770) `_ 6.3.12 ====== From 61e2ec62cc3cffdcfa8892978044dd306dc7e0ff Mon Sep 17 00:00:00 2001 From: Jon Fu Date: Mon, 24 May 2021 16:06:37 -0400 Subject: [PATCH 426/461] more release notes added --- documentation/sphinx/source/release-notes/release-notes-630.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/documentation/sphinx/source/release-notes/release-notes-630.rst b/documentation/sphinx/source/release-notes/release-notes-630.rst index 6cb34252ec..c180798496 100644 --- a/documentation/sphinx/source/release-notes/release-notes-630.rst +++ b/documentation/sphinx/source/release-notes/release-notes-630.rst @@ -9,6 +9,7 @@ Release Notes * Added ``fetched_versions`` to the storage metrics section of status to report how fast a storage server is catching up in versions. `(PR #4770) `_ * Added ``fetches_from_logs`` to the storage metrics section of status to report how frequently a storage server fetches updates from transaction logs. `(PR #4770) `_ * Added the ``bypass_unreadable`` transaction option which allows ``get`` operations to read from sections of keyspace that have become unreadable because of versionstamp operations. `(PR #4774) `_ +* Fix several packaging issues. The osx package should now install successfully, and the structure of the RPM and DEB packages should match that of 6.2. 
`(PR #4810) `_ 6.3.13 ====== From 3b08f39cc34af28579b32120b6f51ca127fe0791 Mon Sep 17 00:00:00 2001 From: Jon Fu Date: Mon, 24 May 2021 16:07:55 -0400 Subject: [PATCH 427/461] remove extra notes in "Status" section --- .../sphinx/source/release-notes/release-notes-630.rst | 3 --- 1 file changed, 3 deletions(-) diff --git a/documentation/sphinx/source/release-notes/release-notes-630.rst b/documentation/sphinx/source/release-notes/release-notes-630.rst index c180798496..307e08ce87 100644 --- a/documentation/sphinx/source/release-notes/release-notes-630.rst +++ b/documentation/sphinx/source/release-notes/release-notes-630.rst @@ -123,9 +123,6 @@ Status * If a process is unable to flush trace logs to disk, the problem will now be reported via the output of ``status`` command inside ``fdbcli``. `(PR #2605) `_ `(PR #2820) `_ * When a configuration key is changed, it will always be included in ``status json`` output, even the value is reverted back to the default value. [6.3.5] `(PR #3610) `_ * Added transactions.rejected_for_queued_too_long for bookkeeping the number of transactions rejected by commit proxy because its queuing time exceeds MVCC window.[6.3.11] `(PR #4353) `_ -* Added ``cluster.bounce_impact`` section to status to report if there will be any extra effects when bouncing the cluster, and if so, the reason for those effects. `(PR #4770) `_ -* Added ``fetched_versions`` to the storage metrics section of status to report how fast a storage server is catching up in versions. `(PR #4770) `_ -* Added ``fetches_from_logs`` to the storage metrics section of status to report how frequently a storage server fetches updates from transaction logs. 
`(PR #4770) `_ Bindings -------- From a0b136356054230aa69248f87836afc70efac35d Mon Sep 17 00:00:00 2001 From: Jon Fu Date: Mon, 24 May 2021 16:28:55 -0400 Subject: [PATCH 428/461] move note to 6.3.14 --- documentation/sphinx/source/release-notes/release-notes-630.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/documentation/sphinx/source/release-notes/release-notes-630.rst b/documentation/sphinx/source/release-notes/release-notes-630.rst index 307e08ce87..af5b2a02c7 100644 --- a/documentation/sphinx/source/release-notes/release-notes-630.rst +++ b/documentation/sphinx/source/release-notes/release-notes-630.rst @@ -10,12 +10,12 @@ Release Notes * Added ``fetches_from_logs`` to the storage metrics section of status to report how frequently a storage server fetches updates from transaction logs. `(PR #4770) `_ * Added the ``bypass_unreadable`` transaction option which allows ``get`` operations to read from sections of keyspace that have become unreadable because of versionstamp operations. `(PR #4774) `_ * Fix several packaging issues. The osx package should now install successfully, and the structure of the RPM and DEB packages should match that of 6.2. `(PR #4810) `_ +* Fix an accounting error that could potentially result in inaccuracies in priority busyness metrics. `(PR #4824) `_ 6.3.13 ====== * Added ``commit_batching_window_size`` to the proxy roles section of status to record statistics about commit batching window size on each proxy. `(PR #4736) `_ * The multi-version client now requires at most two client connections with version 6.2 or larger, regardless of how many external clients are configured. Clients older than 6.2 will continue to create an additional connection each. `(PR #4667) `_ -* Fix an accounting error that could potentially result in inaccuracies in priority busyness metrics. 
`(PR #4824) `_ 6.3.12 ====== From 7efa4e02bcc5de4f9608febf9139bf97c0e939da Mon Sep 17 00:00:00 2001 From: Jon Fu Date: Mon, 24 May 2021 18:23:14 -0400 Subject: [PATCH 429/461] add 6.3.12 notes --- documentation/sphinx/source/release-notes/release-notes-630.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/documentation/sphinx/source/release-notes/release-notes-630.rst b/documentation/sphinx/source/release-notes/release-notes-630.rst index af5b2a02c7..859aee0eb4 100644 --- a/documentation/sphinx/source/release-notes/release-notes-630.rst +++ b/documentation/sphinx/source/release-notes/release-notes-630.rst @@ -23,6 +23,8 @@ Release Notes * Fix accounting error that could cause commits to incorrectly fail with ``proxy_memory_limit_exceeded``. `(PR #4526) `_ * As an optimization, partial restore using target key ranges now filters backup log data prior to loading it into the database. `(PR #4554) `_ * Fix fault tolerance calculation when there are no tLogs in LogSet. `(PR #4454) `_ +* Change client's ``iteration_progression`` size defaults from 256 to 4096 bytes for better performance. `(PR #4416) `_ +* Add the ability to instrument java driver actions, such as FDBTransaction and RangeQuery. 
`(PR #4385) `_ 6.3.11 ====== From 80abc4d86543b34b58abb14c8418f94f66eda86d Mon Sep 17 00:00:00 2001 From: Jon Fu Date: Mon, 24 May 2021 18:27:56 -0400 Subject: [PATCH 430/461] update formatting --- documentation/sphinx/source/release-notes/release-notes-630.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/documentation/sphinx/source/release-notes/release-notes-630.rst b/documentation/sphinx/source/release-notes/release-notes-630.rst index 859aee0eb4..bebd55e859 100644 --- a/documentation/sphinx/source/release-notes/release-notes-630.rst +++ b/documentation/sphinx/source/release-notes/release-notes-630.rst @@ -24,7 +24,7 @@ Release Notes * As an optimization, partial restore using target key ranges now filters backup log data prior to loading it into the database. `(PR #4554) `_ * Fix fault tolerance calculation when there are no tLogs in LogSet. `(PR #4454) `_ * Change client's ``iteration_progression`` size defaults from 256 to 4096 bytes for better performance. `(PR #4416) `_ -* Add the ability to instrument java driver actions, such as FDBTransaction and RangeQuery. `(PR #4385) `_ +* Add the ability to instrument java driver actions, such as ``FDBTransaction`` and ``RangeQuery``. 
`(PR #4385) `_ 6.3.11 ====== From 7cdd43c352c6d04555f12aef9036e2c7d906da66 Mon Sep 17 00:00:00 2001 From: Andrew Noyes Date: Mon, 24 May 2021 19:19:54 +0000 Subject: [PATCH 431/461] Handle retriable errors better in fdb_c_unit_tests --- bindings/c/test/unit/unit_tests.cpp | 87 +++++++++++++++++++++-------- 1 file changed, 63 insertions(+), 24 deletions(-) diff --git a/bindings/c/test/unit/unit_tests.cpp b/bindings/c/test/unit/unit_tests.cpp index 360284e55d..703b1273dd 100644 --- a/bindings/c/test/unit/unit_tests.cpp +++ b/bindings/c/test/unit/unit_tests.cpp @@ -263,13 +263,15 @@ TEST_CASE("fdb_future_set_callback") { &context)); fdb_error_t err = wait_future(f1); + + context.event.wait(); // Wait until callback is called + if (err) { fdb::EmptyFuture f2 = tr.on_error(err); fdb_check(wait_future(f2)); continue; } - context.event.wait(); break; } } @@ -515,10 +517,10 @@ TEST_CASE("write system key") { fdb::Transaction tr(db); std::string syskey("\xff\x02"); - fdb_check(tr.set_option(FDB_TR_OPTION_ACCESS_SYSTEM_KEYS, nullptr, 0)); - tr.set(syskey, "bar"); while (1) { + fdb_check(tr.set_option(FDB_TR_OPTION_ACCESS_SYSTEM_KEYS, nullptr, 0)); + tr.set(syskey, "bar"); fdb::EmptyFuture f1 = tr.commit(); fdb_error_t err = wait_future(f1); @@ -949,16 +951,25 @@ TEST_CASE("fdb_transaction_clear") { } TEST_CASE("fdb_transaction_atomic_op FDB_MUTATION_TYPE_ADD") { - insert_data(db, create_data({ { "foo", "a" } })); + insert_data(db, create_data({ { "foo", "\x00" } })); fdb::Transaction tr(db); int8_t param = 1; + int potentialCommitCount = 0; while (1) { tr.atomic_op(key("foo"), (const uint8_t*)¶m, sizeof(param), FDB_MUTATION_TYPE_ADD); + if (potentialCommitCount + 1 == 256) { + // Trying to commit again might overflow the one unsigned byte we're looking at + break; + } + ++potentialCommitCount; fdb::EmptyFuture f1 = tr.commit(); fdb_error_t err = wait_future(f1); if (err) { + if (fdb_error_predicate(FDB_ERROR_PREDICATE_RETRYABLE_NOT_COMMITTED, err)) { + 
--potentialCommitCount; + } fdb::EmptyFuture f2 = tr.on_error(err); fdb_check(wait_future(f2)); continue; @@ -969,7 +980,8 @@ TEST_CASE("fdb_transaction_atomic_op FDB_MUTATION_TYPE_ADD") { auto value = get_value(key("foo"), /* snapshot */ false, {}); REQUIRE(value.has_value()); CHECK(value->size() == 1); - CHECK(value->data()[0] == 'b'); // incrementing 'a' results in 'b' + CHECK(uint8_t(value->data()[0]) > 0); + CHECK(uint8_t(value->data()[0]) <= potentialCommitCount); } TEST_CASE("fdb_transaction_atomic_op FDB_MUTATION_TYPE_BIT_AND") { @@ -1139,14 +1151,19 @@ TEST_CASE("fdb_transaction_atomic_op FDB_MUTATION_TYPE_BIT_XOR") { fdb::Transaction tr(db); char param[] = { 'a', 'd' }; + int potentialCommitCount = 0; while (1) { tr.atomic_op(key("foo"), (const uint8_t*)"b", 1, FDB_MUTATION_TYPE_BIT_XOR); tr.atomic_op(key("bar"), (const uint8_t*)param, 2, FDB_MUTATION_TYPE_BIT_XOR); tr.atomic_op(key("baz"), (const uint8_t*)"d", 1, FDB_MUTATION_TYPE_BIT_XOR); + ++potentialCommitCount; fdb::EmptyFuture f1 = tr.commit(); fdb_error_t err = wait_future(f1); if (err) { + if (fdb_error_predicate(FDB_ERROR_PREDICATE_RETRYABLE_NOT_COMMITTED, err)) { + --potentialCommitCount; + } fdb::EmptyFuture f2 = tr.on_error(err); fdb_check(wait_future(f2)); continue; @@ -1154,6 +1171,11 @@ TEST_CASE("fdb_transaction_atomic_op FDB_MUTATION_TYPE_BIT_XOR") { break; } + if (potentialCommitCount != 1) { + MESSAGE("Transaction may not have committed exactly once. 
Suppressing assertions"); + return; + } + auto value = get_value(key("foo"), /* snapshot */ false, {}); REQUIRE(value.has_value()); CHECK(value->size() == 1); @@ -1204,13 +1226,18 @@ TEST_CASE("fdb_transaction_atomic_op FDB_MUTATION_TYPE_APPEND_IF_FITS") { insert_data(db, create_data({ { "foo", "f" } })); fdb::Transaction tr(db); + int potentialCommitCount = 0; while (1) { tr.atomic_op(key("foo"), (const uint8_t*)"db", 2, FDB_MUTATION_TYPE_APPEND_IF_FITS); tr.atomic_op(key("bar"), (const uint8_t*)"foundation", 10, FDB_MUTATION_TYPE_APPEND_IF_FITS); + ++potentialCommitCount; fdb::EmptyFuture f1 = tr.commit(); fdb_error_t err = wait_future(f1); if (err) { + if (fdb_error_predicate(FDB_ERROR_PREDICATE_RETRYABLE_NOT_COMMITTED, err)) { + --potentialCommitCount; + } fdb::EmptyFuture f2 = tr.on_error(err); fdb_check(wait_future(f2)); continue; @@ -1218,13 +1245,18 @@ TEST_CASE("fdb_transaction_atomic_op FDB_MUTATION_TYPE_APPEND_IF_FITS") { break; } - auto value = get_value(key("foo"), /* snapshot */ false, {}); - REQUIRE(value.has_value()); - CHECK(value->compare("fdb") == 0); + auto value_foo = get_value(key("foo"), /* snapshot */ false, {}); + REQUIRE(value_foo.has_value()); - value = get_value(key("bar"), /* snapshot */ false, {}); - REQUIRE(value.has_value()); - CHECK(value->compare("foundation") == 0); + auto value_bar = get_value(key("bar"), /* snapshot */ false, {}); + REQUIRE(value_bar.has_value()); + + if (potentialCommitCount != 1) { + MESSAGE("Transaction may not have committed exactly once. 
Suppressing assertions"); + } else { + CHECK(value_foo.value() == "fdb"); + CHECK(value_bar.value() == "foundation"); + } } TEST_CASE("fdb_transaction_atomic_op FDB_MUTATION_TYPE_MAX") { @@ -1576,7 +1608,7 @@ TEST_CASE("fdb_transaction_watch max watches") { fdb_check(f1.set_callback( +[](FDBFuture* f, void* param) { fdb_error_t err = fdb_future_get_error(f); - if (err != 1101) { // operation_cancelled + if (err != /*operation_cancelled*/ 1101 && !fdb_error_predicate(FDB_ERROR_PREDICATE_RETRYABLE, err)) { CHECK(err == 1032); // too_many_watches } auto* event = static_cast*>(param); @@ -1587,7 +1619,7 @@ TEST_CASE("fdb_transaction_watch max watches") { fdb_check(f2.set_callback( +[](FDBFuture* f, void* param) { fdb_error_t err = fdb_future_get_error(f); - if (err != 1101) { // operation_cancelled + if (err != /*operation_cancelled*/ 1101 && !fdb_error_predicate(FDB_ERROR_PREDICATE_RETRYABLE, err)) { CHECK(err == 1032); // too_many_watches } auto* event = static_cast*>(param); @@ -1598,7 +1630,7 @@ TEST_CASE("fdb_transaction_watch max watches") { fdb_check(f3.set_callback( +[](FDBFuture* f, void* param) { fdb_error_t err = fdb_future_get_error(f); - if (err != 1101) { // operation_cancelled + if (err != /*operation_cancelled*/ 1101 && !fdb_error_predicate(FDB_ERROR_PREDICATE_RETRYABLE, err)) { CHECK(err == 1032); // too_many_watches } auto* event = static_cast*>(param); @@ -1609,7 +1641,7 @@ TEST_CASE("fdb_transaction_watch max watches") { fdb_check(f4.set_callback( +[](FDBFuture* f, void* param) { fdb_error_t err = fdb_future_get_error(f); - if (err != 1101) { // operation_cancelled + if (err != /*operation_cancelled*/ 1101 && !fdb_error_predicate(FDB_ERROR_PREDICATE_RETRYABLE, err)) { CHECK(err == 1032); // too_many_watches } auto* event = static_cast*>(param); @@ -1671,7 +1703,7 @@ TEST_CASE("fdb_transaction_cancel") { // ... until the transaction has been reset. 
tr.reset(); fdb::ValueFuture f2 = tr.get("foo", /* snapshot */ false); - fdb_check(wait_future(f2)); + CHECK(wait_future(f2) != 1025); // transaction_cancelled } TEST_CASE("fdb_transaction_add_conflict_range") { @@ -2146,22 +2178,29 @@ TEST_CASE("monitor_network_busyness") { } int main(int argc, char** argv) { - if (argc != 3 && argc != 4) { + if (argc < 3) { std::cout << "Unit tests for the FoundationDB C API.\n" - << "Usage: fdb_c_unit_tests /path/to/cluster_file key_prefix [externalClient]" << std::endl; + << "Usage: fdb_c_unit_tests /path/to/cluster_file key_prefix [externalClient] [doctest args]" + << std::endl; return 1; } fdb_check(fdb_select_api_version(710)); - if (argc == 4) { + if (argc >= 4) { std::string externalClientLibrary = argv[3]; - fdb_check(fdb_network_set_option( - FDBNetworkOption::FDB_NET_OPTION_DISABLE_LOCAL_CLIENT, reinterpret_cast(""), 0)); - fdb_check(fdb_network_set_option(FDBNetworkOption::FDB_NET_OPTION_EXTERNAL_CLIENT_LIBRARY, - reinterpret_cast(externalClientLibrary.c_str()), - externalClientLibrary.size())); + if (externalClientLibrary.substr(0, 2) != "--") { + fdb_check(fdb_network_set_option( + FDBNetworkOption::FDB_NET_OPTION_DISABLE_LOCAL_CLIENT, reinterpret_cast(""), 0)); + fdb_check(fdb_network_set_option(FDBNetworkOption::FDB_NET_OPTION_EXTERNAL_CLIENT_LIBRARY, + reinterpret_cast(externalClientLibrary.c_str()), + externalClientLibrary.size())); + } } + /* fdb_check(fdb_network_set_option( */ + /* FDBNetworkOption::FDB_NET_OPTION_CLIENT_BUGGIFY_ENABLE, reinterpret_cast(""), 0)); */ + doctest::Context context; + context.applyCommandLine(argc, argv); fdb_check(fdb_setup_network()); std::thread network_thread{ &fdb_run_network }; From 07edc1db9a84d3da138a773cd5cfd33d55412f1f Mon Sep 17 00:00:00 2001 From: Josh Slocum Date: Tue, 25 May 2021 15:32:49 +0000 Subject: [PATCH 432/461] Removing spaces in SevWarn trace event names --- fdbserver/FDBExecHelper.actor.cpp | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 
deletions(-) diff --git a/fdbserver/FDBExecHelper.actor.cpp b/fdbserver/FDBExecHelper.actor.cpp index 618bba35be..e4185ad7c2 100644 --- a/fdbserver/FDBExecHelper.actor.cpp +++ b/fdbserver/FDBExecHelper.actor.cpp @@ -148,7 +148,10 @@ ACTOR Future spawnProcess(std::string path, state pid_t pid = pidAndReadFD.first; state Optional readFD = pidAndReadFD.second; if (pid == -1) { - TraceEvent(SevWarnAlways, "SpawnProcess: Command failed to spawn").detail("Cmd", path).detail("Args", allArgs); + TraceEvent(SevWarnAlways, "SpawnProcessFailure") + .detail("Reason", "Command failed to spawn") + .detail("Cmd", path) + .detail("Args", allArgs); return -1; } else if (pid > 0) { state int status = -1; @@ -160,7 +163,8 @@ ACTOR Future spawnProcess(std::string path, if (runTime > maxWaitTime) { // timing out - TraceEvent(SevWarnAlways, "SpawnProcess : Command failed, timeout") + TraceEvent(SevWarnAlways, "SpawnProcessFailure") + .detail("Reason", "Command failed, timeout") .detail("Cmd", path) .detail("Args", allArgs); return -1; @@ -175,9 +179,10 @@ ACTOR Future spawnProcess(std::string path, } if (err < 0) { - TraceEvent event(SevWarnAlways, "SpawnProcess : Command failed"); + TraceEvent event(SevWarnAlways, "SpawnProcessFailure"); setupTraceWithOutput(event, bytesRead, outputBuffer); - event.detail("Cmd", path) + event.detail("Reason", "Command failed") + .detail("Cmd", path) .detail("Args", allArgs) .detail("Errno", WIFEXITED(status) ? WEXITSTATUS(status) : -1); return -1; @@ -194,14 +199,15 @@ ACTOR Future spawnProcess(std::string path, } else { // child process completed if (!(WIFEXITED(status) && WEXITSTATUS(status) == 0)) { - TraceEvent event(SevWarnAlways, "SpawnProcess : Command failed"); + TraceEvent event(SevWarnAlways, "SpawnProcessFailure"); setupTraceWithOutput(event, bytesRead, outputBuffer); - event.detail("Cmd", path) + event.detail("Reason", "Command failed") + .detail("Cmd", path) .detail("Args", allArgs) .detail("Errno", WIFEXITED(status) ? 
WEXITSTATUS(status) : -1); return WIFEXITED(status) ? WEXITSTATUS(status) : -1; } - TraceEvent event("SpawnProcess : Command status"); + TraceEvent event("SpawnProcess_CommandStatus"); setupTraceWithOutput(event, bytesRead, outputBuffer); event.detail("Cmd", path) .detail("Args", allArgs) From a39dec1380f61bee25d0cc9fe20c3008bb37d37d Mon Sep 17 00:00:00 2001 From: Josh Slocum Date: Tue, 25 May 2021 15:51:26 +0000 Subject: [PATCH 433/461] Fixing multiple small redwood test bugs --- fdbserver/VersionedBTree.actor.cpp | 37 ++++++++++++++++++++++++++---- 1 file changed, 33 insertions(+), 4 deletions(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 8a919fd190..49eac05655 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -40,6 +40,12 @@ #define REDWOOD_DEBUG 0 +// Only print redwood debug statements for a certain address. Useful in simulation with many redwood processes to reduce +// log size. +#define REDWOOD_DEBUG_ADDR 0 +// example addr: "[abcd::4:0:1:4]:1" +#define REDWOOD_DEBUG_ADDR_VAL ""; + #define debug_printf_stream stdout #define debug_printf_always(...) \ { \ @@ -49,11 +55,25 @@ fflush(debug_printf_stream); \ } +#define debug_printf_addr(...) \ + { \ + std::string addr = REDWOOD_DEBUG_ADDR_VAL; \ + if (!memcmp(addr.c_str(), g_network->getLocalAddress().toString().c_str(), addr.size())) { \ + std::string prefix = \ + format("%s %f %04d ", g_network->getLocalAddress().toString().c_str(), now(), __LINE__); \ + std::string msg = format(__VA_ARGS__); \ + writePrefixedLines(debug_printf_stream, prefix, msg); \ + fflush(debug_printf_stream); \ + } \ + } + #define debug_printf_noop(...) 
#if defined(NO_INTELLISENSE) #if REDWOOD_DEBUG #define debug_printf debug_printf_always +#elif REDWOOD_DEBUG_ADDR +#define debug_printf debug_printf_addr #else #define debug_printf debug_printf_noop #endif @@ -3868,9 +3888,10 @@ private: std::unordered_map parents; ParentInfoMapT childUpdateTracker; - // MetaKey changes size so allocate space for it to expand into + // MetaKey changes size so allocate space for it to expand into. FIXME: Steve is fixing this to be dynamically + // sized. union { - uint8_t headerSpace[sizeof(MetaKey) + sizeof(LogicalPageID) * 30]; + uint8_t headerSpace[sizeof(MetaKey) + sizeof(LogicalPageID) * 200]; MetaKey m_header; }; @@ -7548,7 +7569,11 @@ TEST_CASE("/redwood/correctness/unit/deltaTree/IntIntPair") { std::set uniqueItems; while (uniqueItems.size() < N) { IntIntPair p = randomPair(); - if (uniqueItems.count(p) == 0) { + auto nextP = p; // also check if next highest/lowest key is not in set for testLTE/testGTE + nextP.v++; + auto prevP = p; + prevP.v--; + if (uniqueItems.count(p) == 0 && uniqueItems.count(nextP) == 0 && uniqueItems.count(prevP) == 0) { uniqueItems.insert(p); } } @@ -7566,7 +7591,11 @@ TEST_CASE("/redwood/correctness/unit/deltaTree/IntIntPair") { std::vector toDelete; while (1) { IntIntPair p = randomPair(); - if (uniqueItems.count(p) == 0) { + auto nextP = p; // also check if next highest/lowest key is not in set for testLTE/testGTE + nextP.v++; + auto prevP = p; + prevP.v--; + if (uniqueItems.count(p) == 0 && uniqueItems.count(nextP) == 0 && uniqueItems.count(prevP) == 0) { if (!r.insert(p)) { break; }; From c31196ab01d2ebf65389a92bc941d2195dec2137 Mon Sep 17 00:00:00 2001 From: Josh Slocum Date: Tue, 25 May 2021 10:24:00 -0700 Subject: [PATCH 434/461] Update fdbserver/FDBExecHelper.actor.cpp Co-authored-by: A.J. 
Beamon --- fdbserver/FDBExecHelper.actor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbserver/FDBExecHelper.actor.cpp b/fdbserver/FDBExecHelper.actor.cpp index e4185ad7c2..1a32999e7d 100644 --- a/fdbserver/FDBExecHelper.actor.cpp +++ b/fdbserver/FDBExecHelper.actor.cpp @@ -207,7 +207,7 @@ ACTOR Future spawnProcess(std::string path, .detail("Errno", WIFEXITED(status) ? WEXITSTATUS(status) : -1); return WIFEXITED(status) ? WEXITSTATUS(status) : -1; } - TraceEvent event("SpawnProcess_CommandStatus"); + TraceEvent event("SpawnProcessCommandStatus"); setupTraceWithOutput(event, bytesRead, outputBuffer); event.detail("Cmd", path) .detail("Args", allArgs) From ce82c9653e7708e0ce6bdb38c538934f3825fa2d Mon Sep 17 00:00:00 2001 From: Josh Slocum Date: Fri, 5 Mar 2021 19:28:15 +0000 Subject: [PATCH 435/461] Testing Storage Server implementation --- .../source/mr-status-json-schemas.rst.inc | 10 + fdbcli/fdbcli.actor.cpp | 16 +- fdbclient/BackupAgentBase.actor.cpp | 18 +- fdbclient/CMakeLists.txt | 1 + fdbclient/CommitProxyInterface.h | 15 +- fdbclient/DatabaseConfiguration.cpp | 27 +- fdbclient/DatabaseConfiguration.h | 4 + fdbclient/DatabaseContext.h | 9 + fdbclient/ManagementAPI.actor.cpp | 39 +- fdbclient/NativeAPI.actor.cpp | 261 +++++++- fdbclient/Schemas.cpp | 17 +- fdbclient/StorageServerInterface.cpp | 465 ++++++++++++++ fdbclient/StorageServerInterface.h | 19 +- fdbclient/SystemData.cpp | 85 ++- fdbclient/SystemData.h | 5 + fdbrpc/CMakeLists.txt | 3 +- fdbrpc/LoadBalance.actor.h | 106 +++- fdbrpc/QueueModel.cpp | 35 ++ fdbrpc/QueueModel.h | 24 +- fdbrpc/TSSComparison.h | 78 +++ fdbrpc/fdbrpc.h | 2 + fdbrpc/simulator.h | 6 +- fdbserver/ApplyMetadataMutation.cpp | 7 + fdbserver/ClusterController.actor.cpp | 143 ++++- fdbserver/CommitProxyServer.actor.cpp | 3 + fdbserver/DataDistribution.actor.cpp | 569 +++++++++++++++--- fdbserver/DataDistributionTracker.actor.cpp | 14 +- fdbserver/Knobs.cpp | 1 + fdbserver/Knobs.h | 1 + 
fdbserver/MoveKeys.actor.cpp | 482 +++++++++++---- fdbserver/MoveKeys.actor.h | 3 +- fdbserver/MutationTracking.cpp | 3 + fdbserver/QuietDatabase.actor.cpp | 16 +- fdbserver/Ratekeeper.actor.cpp | 8 +- fdbserver/SimulatedCluster.actor.cpp | 44 ++ fdbserver/Status.actor.cpp | 16 +- fdbserver/TLogServer.actor.cpp | 5 + fdbserver/WorkerInterface.actor.h | 11 +- fdbserver/masterserver.actor.cpp | 6 + fdbserver/storageserver.actor.cpp | 562 +++++++++++++++-- fdbserver/tester.actor.cpp | 20 + fdbserver/worker.actor.cpp | 60 +- .../workloads/ConsistencyCheck.actor.cpp | 190 +++++- fdbserver/workloads/RandomMoveKeys.actor.cpp | 3 +- fdbserver/workloads/workloads.actor.h | 6 +- flow/ProtocolVersion.h | 4 +- flow/genericactors.actor.h | 2 + flow/serialize.h | 9 + tests/CMakeLists.txt | 2 + tests/StorageServerInterface.txt | 7 + tests/SystemData.txt | 7 + 51 files changed, 3128 insertions(+), 321 deletions(-) create mode 100644 fdbclient/StorageServerInterface.cpp create mode 100644 fdbrpc/TSSComparison.h create mode 100644 tests/StorageServerInterface.txt create mode 100644 tests/SystemData.txt diff --git a/documentation/sphinx/source/mr-status-json-schemas.rst.inc b/documentation/sphinx/source/mr-status-json-schemas.rst.inc index 7979331898..914a682c4c 100644 --- a/documentation/sphinx/source/mr-status-json-schemas.rst.inc +++ b/documentation/sphinx/source/mr-status-json-schemas.rst.inc @@ -682,6 +682,16 @@ "ssd-rocksdb-experimental", "memory" ]}, + "tss_count":1, + "tss_storage_engine":{ + "$enum":[ + "ssd", + "ssd-1", + "ssd-2", + "ssd-redwood-experimental", + "ssd-rocksdb-experimental", + "memory" + ]}, "coordinators_count":1, "excluded_servers":[ { diff --git a/fdbcli/fdbcli.actor.cpp b/fdbcli/fdbcli.actor.cpp index 7f1bb3b735..0b53bcde6d 100644 --- a/fdbcli/fdbcli.actor.cpp +++ b/fdbcli/fdbcli.actor.cpp @@ -501,7 +501,10 @@ void initHelp() { "change the database configuration", "The `new' option, if present, initializes a new database with the given configuration 
rather than changing " "the configuration of an existing one. When used, both a redundancy mode and a storage engine must be " - "specified.\n\nRedundancy mode:\n single - one copy of the data. Not fault tolerant.\n double - two copies " + "specified.\n\ntss: when enabled, configures the testing storage server for the cluster instead." + "When used with new to set up tss for the first time, it requires both a count and a storage engine." + "To disable the testing storage server, run \"configure tss count=0\"\n\n" + "Redundancy mode:\n single - one copy of the data. Not fault tolerant.\n double - two copies " "of data (survive one failure).\n triple - three copies of data (survive two failures).\n three_data_hall - " "See the Admin Guide.\n three_datacenter - See the Admin Guide.\n\nStorage engine:\n ssd - B-Tree storage " "engine optimized for solid state disks.\n memory - Durable in-memory storage engine for small " @@ -1128,6 +1131,17 @@ void printStatus(StatusObjectReader statusObj, if (statusObjConfig.get("log_routers", intVal)) outputString += format("\n Desired Log Routers - %d", intVal); + if (statusObjConfig.get("tss_count", intVal) && intVal > 0) { + int activeTss = 0; + if (statusObjCluster.has("active_tss_count")) { + statusObjCluster.get("active_tss_count", activeTss); + } + outputString += format("\n TSS - %d/%d", activeTss, intVal); + + if (statusObjConfig.get("tss_storage_engine", strVal)) + outputString += format("\n TSS Storage Engine - %s", strVal.c_str()); + } + outputString += "\n Usable Regions - "; if (statusObjConfig.get("usable_regions", intVal)) { outputString += std::to_string(intVal); diff --git a/fdbclient/BackupAgentBase.actor.cpp b/fdbclient/BackupAgentBase.actor.cpp index fba2e69954..cc861f310a 100644 --- a/fdbclient/BackupAgentBase.actor.cpp +++ b/fdbclient/BackupAgentBase.actor.cpp @@ -404,8 +404,14 @@ ACTOR Future readCommitted(Database cx, state RangeResult values = wait(tr.getRange(begin, end, limits)); // When this buggify line 
is enabled, if there are more than 1 result then use half of the results + // Copy the data instead of messing with the results directly to avoid TSS issues. if (values.size() > 1 && BUGGIFY) { - values.resize(values.arena(), values.size() / 2); + Standalone copy; + // only copy first half of values into copy + for (int i = 0; i < values.size() / 2; i++) { + copy.push_back_deep(copy.arena(), values[i]); + } + values = copy; values.more = true; // Half of the time wait for this tr to expire so that the next read is at a different version if (deterministicRandom()->random01() < 0.5) @@ -469,9 +475,15 @@ ACTOR Future readCommitted(Database cx, state RangeResult rangevalue = wait(tr.getRange(nextKey, end, limits)); - // When this buggify line is enabled, if there are more than 1 result then use half of the results + // When this buggify line is enabled, if there are more than 1 result then use half of the results. + // Copy the data instead of messing with the results directly to avoid TSS issues. 
if (rangevalue.size() > 1 && BUGGIFY) { - rangevalue.resize(rangevalue.arena(), rangevalue.size() / 2); + Standalone copy; + // only copy first half of rangevalue into copy + for (int i = 0; i < rangevalue.size() / 2; i++) { + copy.push_back_deep(copy.arena(), rangevalue[i]); + } + rangevalue = copy; rangevalue.more = true; // Half of the time wait for this tr to expire so that the next read is at a different version if (deterministicRandom()->random01() < 0.5) diff --git a/fdbclient/CMakeLists.txt b/fdbclient/CMakeLists.txt index bd14ef7b52..75b9fd5a0a 100644 --- a/fdbclient/CMakeLists.txt +++ b/fdbclient/CMakeLists.txt @@ -68,6 +68,7 @@ set(FDBCLIENT_SRCS Status.h StatusClient.actor.cpp StatusClient.h + StorageServerInterface.cpp StorageServerInterface.h Subspace.cpp Subspace.h diff --git a/fdbclient/CommitProxyInterface.h b/fdbclient/CommitProxyInterface.h index 794b88ceaa..16f6695a03 100644 --- a/fdbclient/CommitProxyInterface.h +++ b/fdbclient/CommitProxyInterface.h @@ -116,18 +116,31 @@ struct ClientDBInfo { firstCommitProxy; // not serialized, used for commitOnFirstProxy when the commit proxies vector has been shrunk Optional forward; vector history; + vector> + tssMapping; // logically map for all active TSS pairs ClientDBInfo() {} bool operator==(ClientDBInfo const& r) const { return id == r.id; } bool operator!=(ClientDBInfo const& r) const { return id != r.id; } + // convenience method to treat tss mapping like a map + // TODO can serializer handle maps? 
could just change it + Optional getTssPair(UID storageServerID) const { + for (auto& it : tssMapping) { + if (it.first == storageServerID) { + return Optional(it.second); + } + } + return Optional(); + } + template void serialize(Archive& ar) { if constexpr (!is_fb_function) { ASSERT(ar.protocolVersion().isValid()); } - serializer(ar, grvProxies, commitProxies, id, forward, history); + serializer(ar, grvProxies, commitProxies, id, forward, history, tssMapping); } }; diff --git a/fdbclient/DatabaseConfiguration.cpp b/fdbclient/DatabaseConfiguration.cpp index 838f6c3c10..a2cfc435b3 100644 --- a/fdbclient/DatabaseConfiguration.cpp +++ b/fdbclient/DatabaseConfiguration.cpp @@ -31,7 +31,8 @@ void DatabaseConfiguration::resetInternal() { commitProxyCount = grvProxyCount = resolverCount = desiredTLogCount = tLogWriteAntiQuorum = tLogReplicationFactor = storageTeamSize = desiredLogRouterCount = -1; tLogVersion = TLogVersion::DEFAULT; - tLogDataStoreType = storageServerStoreType = KeyValueStoreType::END; + tLogDataStoreType = storageServerStoreType = testingStorageServerStoreType = KeyValueStoreType::END; + desiredTSSCount = 0; tLogSpillType = TLogSpillType::DEFAULT; autoCommitProxyCount = CLIENT_KNOBS->DEFAULT_AUTO_COMMIT_PROXIES; autoGrvProxyCount = CLIENT_KNOBS->DEFAULT_AUTO_GRV_PROXIES; @@ -299,6 +300,25 @@ StatusObject DatabaseConfiguration::toJSON(bool noPolicies) const { result["storage_engine"] = "custom"; } + if (desiredTSSCount > 0) { + result["tss_count"] = desiredTSSCount; + if (testingStorageServerStoreType == KeyValueStoreType::SSD_BTREE_V1) { + result["tss_storage_engine"] = "ssd-1"; + } else if (testingStorageServerStoreType == KeyValueStoreType::SSD_BTREE_V2) { + result["tss_storage_engine"] = "ssd-2"; + } else if (testingStorageServerStoreType == KeyValueStoreType::SSD_REDWOOD_V1) { + result["tss_storage_engine"] = "ssd-redwood-experimental"; + } else if (testingStorageServerStoreType == KeyValueStoreType::SSD_ROCKSDB_V1) { + result["tss_storage_engine"] = 
"ssd-rocksdb-experimental"; + } else if (testingStorageServerStoreType == KeyValueStoreType::MEMORY_RADIXTREE) { + result["tss_storage_engine"] = "memory-radixtree-beta"; + } else if (testingStorageServerStoreType == KeyValueStoreType::MEMORY) { + result["tss_storage_engine"] = "memory-2"; + } else { + result["tss_storage_engine"] = "custom"; + } + } + result["log_spill"] = (int)tLogSpillType; if (remoteTLogReplicationFactor == 1) { @@ -449,6 +469,8 @@ bool DatabaseConfiguration::setInternal(KeyRef key, ValueRef value) { } } else if (ck == LiteralStringRef("storage_replicas")) { parse(&storageTeamSize, value); + } else if (ck == LiteralStringRef("tss_count")) { + parse(&desiredTSSCount, value); } else if (ck == LiteralStringRef("log_version")) { parse((&type), value); type = std::max((int)TLogVersion::MIN_RECRUITABLE, type); @@ -471,6 +493,9 @@ bool DatabaseConfiguration::setInternal(KeyRef key, ValueRef value) { } else if (ck == LiteralStringRef("storage_engine")) { parse((&type), value); storageServerStoreType = (KeyValueStoreType::StoreType)type; + } else if (ck == LiteralStringRef("tss_storage_engine")) { + parse((&type), value); + testingStorageServerStoreType = (KeyValueStoreType::StoreType)type; } else if (ck == LiteralStringRef("auto_commit_proxies")) { parse(&autoCommitProxyCount, value); } else if (ck == LiteralStringRef("auto_grv_proxies")) { diff --git a/fdbclient/DatabaseConfiguration.h b/fdbclient/DatabaseConfiguration.h index ef539f40b0..0df45ce228 100644 --- a/fdbclient/DatabaseConfiguration.h +++ b/fdbclient/DatabaseConfiguration.h @@ -225,6 +225,10 @@ struct DatabaseConfiguration { int32_t storageTeamSize; KeyValueStoreType storageServerStoreType; + // Testing StorageServers + int32_t desiredTSSCount; + KeyValueStoreType testingStorageServerStoreType; + // Remote TLogs int32_t desiredLogRouterCount; int32_t remoteDesiredTLogCount; diff --git a/fdbclient/DatabaseContext.h b/fdbclient/DatabaseContext.h index ae1a5a741b..b1dee87f18 100644 --- 
a/fdbclient/DatabaseContext.h +++ b/fdbclient/DatabaseContext.h @@ -273,6 +273,9 @@ public: Reference>> connectionFile; AsyncTrigger proxiesChangeTrigger; Future monitorProxiesInfoChange; + Future monitorTssInfoChange; + Future tssMismatchHandler; + PromiseStream tssMismatchStream; Reference commitProxies; Reference grvProxies; bool proxyProvisional; // Provisional commit proxy and grv proxy are used at the same time. @@ -320,6 +323,8 @@ public: std::map server_interf; + std::map> tssMetrics; + UID dbId; bool internal; // Only contexts created through the C client and fdbcli are non-internal @@ -419,6 +424,10 @@ public: static bool debugUseTags; static const std::vector debugTransactionTagChoices; std::unordered_map> watchMap; + + // TODO should this be private? + void maybeAddTssMapping(StorageServerInterface const& ssi); + void addTssMapping(StorageServerInterface const& ssi, StorageServerInterface const& tssi); }; #endif diff --git a/fdbclient/ManagementAPI.actor.cpp b/fdbclient/ManagementAPI.actor.cpp index f53cf65828..8b4d03b4d8 100644 --- a/fdbclient/ManagementAPI.actor.cpp +++ b/fdbclient/ManagementAPI.actor.cpp @@ -60,6 +60,13 @@ std::map configForToken(std::string const& mode) { return out; } + if (mode == "tss") { + // Set temporary marker in config map to mark that this is a tss configuration and not a normal storage/log + // configuration. A bit of a hack but reuses the parsing code nicely. + out[p + "istss"] = "1"; + return out; + } + if (mode == "locked") { // Setting this key is interpreted as an instruction to use the normal version-stamp-based mechanism for locking // the database. 
@@ -119,7 +126,7 @@ std::map configForToken(std::string const& mode) { if ((key == "logs" || key == "commit_proxies" || key == "grv_proxies" || key == "resolvers" || key == "remote_logs" || key == "log_routers" || key == "usable_regions" || - key == "repopulate_anti_quorum") && + key == "repopulate_anti_quorum" || key == "count") && isInteger(value)) { out[p + key] = value; } @@ -334,6 +341,36 @@ ConfigurationResult buildConfiguration(std::vector const& modeTokens, serializeReplicationPolicy(policyWriter, logPolicy); outConf[p + "log_replication_policy"] = policyWriter.toValue().toString(); } + if (outConf.count(p + "istss")) { + // redo config parameters to be tss config instead of normal config + + // save param values from parsing as a normal config + bool isNew = outConf.count(p + "initialized"); + Optional count; + Optional storageEngine; + if (outConf.count(p + "count")) { + count = Optional(outConf[p + "count"]); + } + if (outConf.count(p + "storage_engine")) { + storageEngine = Optional(outConf[p + "storage_engine"]); + } + + // A new tss setup must have count + storage engine. An adjustment must have at least one. + if ((isNew && (!count.present() || !storageEngine.present())) || + (!isNew && !count.present() && !storageEngine.present())) { + // TODO is this the right error type? And should we log something? 
+ return ConfigurationResult::INCOMPLETE_CONFIGURATION; + } + + // clear map and only reset tss parameters + outConf.clear(); + if (count.present()) { + outConf[p + "tss_count"] = count.get(); + } + if (storageEngine.present()) { + outConf[p + "tss_storage_engine"] = storageEngine.get(); + } + } return ConfigurationResult::SUCCESS; } diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 75c11db594..9cd9a32d8c 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -38,6 +38,7 @@ #include "fdbclient/DatabaseContext.h" #include "fdbclient/GlobalConfig.actor.h" #include "fdbclient/JsonBuilder.h" +#include "fdbclient/KeyBackedTypes.h" #include "fdbclient/KeyRangeMap.h" #include "fdbclient/Knobs.h" #include "fdbclient/ManagementAPI.actor.h" @@ -121,6 +122,52 @@ NetworkOptions::NetworkOptions() static const Key CLIENT_LATENCY_INFO_PREFIX = LiteralStringRef("client_latency/"); static const Key CLIENT_LATENCY_INFO_CTR_PREFIX = LiteralStringRef("client_latency_counter/"); +// TODO make tss function here +void DatabaseContext::maybeAddTssMapping(StorageServerInterface const& ssi) { + // add tss mapping if server is new + + Optional tssPair = clientInfo->get().getTssPair(ssi.id()); + if (tssPair.present()) { + addTssMapping(ssi, tssPair.get()); + } +} + +// calling getInterface potentially recursively is weird, but since this function is only called when an entry is +// created/changed, the recursive call should never recurse itself. +void DatabaseContext::addTssMapping(StorageServerInterface const& ssi, StorageServerInterface const& tssi) { + // TODO get both with a getInterface call which will create the tss endpoint and/or update both endpoints if there + // was a change in endpoint tokens + + // the order of these is important because it hits the "different token same locality" issue, so we always want to + // request the tss first so the ss request overrides it. 
+ // TODO this shouldn't be necessary after i stop doing the same server hack + Reference tssInfo = StorageServerInfo::getInterface(this, tssi, clientLocality); + Reference ssInfo = StorageServerInfo::getInterface(this, ssi, clientLocality); + + // add new tss metrics object to queue + Reference metrics = makeReference(); + tssMetrics[tssi.id()] = metrics; + + // TODO any other requests it makes sense to duplicate? + // add each read data request interface to map (getValue, getKey, getKeyValues, watchValue) + queueModel.updateTssEndpoint( + ssInfo->interf.getValue.getEndpoint().token.first(), + TSSEndpointData(tssi.id(), tssInfo->interf.getValue.getEndpoint(), metrics, clientInfo->get().id)); + queueModel.updateTssEndpoint( + ssInfo->interf.getKey.getEndpoint().token.first(), + TSSEndpointData(tssi.id(), tssInfo->interf.getKey.getEndpoint(), metrics, clientInfo->get().id)); + queueModel.updateTssEndpoint( + ssInfo->interf.getKeyValues.getEndpoint().token.first(), + TSSEndpointData(tssi.id(), tssInfo->interf.getKeyValues.getEndpoint(), metrics, clientInfo->get().id)); + queueModel.updateTssEndpoint( + ssInfo->interf.watchValue.getEndpoint().token.first(), + TSSEndpointData(tssi.id(), tssInfo->interf.watchValue.getEndpoint(), metrics, clientInfo->get().id)); + + // TODO REMOVE + printf( + "added tss endpoints to queue for mapping %s=%s\n", ssi.id().toString().c_str(), tssi.id().toString().c_str()); +} + Reference StorageServerInfo::getInterface(DatabaseContext* cx, StorageServerInterface const& ssi, LocalityData const& locality) { @@ -133,11 +180,19 @@ Reference StorageServerInfo::getInterface(DatabaseContext* cx // pointing to. This is technically correct, but is very unnatural. We may want to refactor load // balance to take an AsyncVar> so that it is notified when the interface // changes. 
+ it->second->interf = ssi; + + // TODO remove print + printf("maybeAddTss same locality %s\n", ssi.id().toString().c_str()); + cx->maybeAddTssMapping(ssi); } else { it->second->notifyContextDestroyed(); Reference loc(new StorageServerInfo(cx, ssi, locality)); cx->server_interf[ssi.id()] = loc.getPtr(); + // TODO REMOVE print + printf("maybeAddTss different locality %s\n", ssi.id().toString().c_str()); + cx->maybeAddTssMapping(ssi); return loc; } } @@ -147,6 +202,9 @@ Reference StorageServerInfo::getInterface(DatabaseContext* cx Reference loc(new StorageServerInfo(cx, ssi, locality)); cx->server_interf[ssi.id()] = loc.getPtr(); + // TODO REMOVE print + // printf("maybeAddTss new ssi %s\n", ssi.id().toString().c_str()); + cx->maybeAddTssMapping(ssi); return loc; } @@ -327,6 +385,55 @@ ACTOR Future databaseLogger(DatabaseContext* cx) { cx->mutationsPerCommit.clear(); cx->bytesPerCommit.clear(); + for (const auto& it : cx->tssMetrics) { + // TODO could skip this tss if request counter is zero? would potentially complicate elapsed calculation + // though + if (it.second->mismatches.getIntervalDelta()) { + printf("Found tss %s with %d mismatches!!\n", + it.first.toString().c_str(), + it.second->mismatches.getIntervalDelta()); + cx->tssMismatchStream.send(it.first); + } + TraceEvent tssEv("TSSClientMetrics", cx->dbId); + tssEv.detail("TSSID", it.first) + .detail("Elapsed", (lastLogged == 0) ? 
0 : now() - lastLogged) + .detail("Internal", cx->internal); + + it.second->cc.logToTraceEvent(tssEv); + + tssEv.detail("MeanSSGetValueLatency", it.second->SSgetValueLatency.mean()) + .detail("MedianSSGetValueLatency", it.second->SSgetValueLatency.median()) + .detail("SSGetValueLatency90", it.second->SSgetValueLatency.percentile(0.90)) + .detail("SSGetValueLatency99", it.second->SSgetValueLatency.percentile(0.99)); + + tssEv.detail("MeanTSSGetValueLatency", it.second->TSSgetValueLatency.mean()) + .detail("MedianTSSGetValueLatency", it.second->TSSgetValueLatency.median()) + .detail("TSSGetValueLatency90", it.second->TSSgetValueLatency.percentile(0.90)) + .detail("TSSGetValueLatencyDiff99", it.second->TSSgetValueLatency.percentile(0.99)); + + tssEv.detail("MeanSSGetKeyLatency", it.second->SSgetKeyLatency.mean()) + .detail("MedianSSGetKeyLatency", it.second->SSgetKeyLatency.median()) + .detail("SSGetKeyLatency90", it.second->SSgetKeyLatency.percentile(0.90)) + .detail("SSGetKeyLatency99", it.second->SSgetKeyLatency.percentile(0.99)); + + tssEv.detail("MeanTSSGetKeyLatency", it.second->TSSgetKeyLatency.mean()) + .detail("MedianTSSGetKeyLatency", it.second->TSSgetKeyLatency.median()) + .detail("TSSGetKeyLatency90", it.second->TSSgetKeyLatency.percentile(0.90)) + .detail("TSSGetKeyLatencyDiff99", it.second->TSSgetKeyLatency.percentile(0.99)); + + tssEv.detail("MeanSSGetKeyValuesLatency", it.second->SSgetKeyLatency.mean()) + .detail("MedianSSGetKeyValuesLatency", it.second->SSgetKeyLatency.median()) + .detail("SSGetKeyValuesLatency90", it.second->SSgetKeyLatency.percentile(0.90)) + .detail("SSGetKeyValuesLatency99", it.second->SSgetKeyLatency.percentile(0.99)); + + tssEv.detail("MeanTSSGetKeyValuesLatency", it.second->TSSgetKeyValuesLatency.mean()) + .detail("MedianTSSGetKeyValuesLatency", it.second->TSSgetKeyValuesLatency.median()) + .detail("TSSGetKeyValuesLatency90", it.second->TSSgetKeyValuesLatency.percentile(0.90)) + .detail("TSSGetKeyValuesLatencyDiff99", 
it.second->TSSgetKeyValuesLatency.percentile(0.99)); + + it.second->clear(); + } + lastLogged = now(); } } @@ -711,6 +818,110 @@ ACTOR Future monitorCacheList(DatabaseContext* self) { } } +// updates tss mapping when set of tss servers changes +ACTOR static Future monitorTssChange(DatabaseContext* cx) { + state vector> curTssMapping; + curTssMapping = cx->clientInfo->get().tssMapping; + + loop { + wait(cx->clientInfo->onChange()); + if (cx->clientInfo->get().tssMapping != curTssMapping) { + // TODO maybe re-read this from system keys instead if it changes + ClientDBInfo clientInfo = cx->clientInfo->get(); + curTssMapping = clientInfo.tssMapping; + + // TODO REMOVE print + // printf("gonna do tss stuff with %d tss's\n", curTssMapping.size()); + + std::unordered_set seenTssIds; + + if (curTssMapping.size()) { + for (const auto& it : curTssMapping) { + seenTssIds.insert(it.second.id()); + + if (cx->server_interf.count(it.first)) { + // TODO REMOVE + printf("found new tss mapping %s -> %s\n", + it.first.toString().c_str(), + it.second.id().toString().c_str()); + cx->addTssMapping(cx->server_interf[it.first]->interf, it.second); + } else { + // TODO REMOVE case and print + // printf("server %s with tss pair %s not in server_interf, skipping for now\n", + // it.first.toString().c_str(), it.second.id().toString().c_str()); + } + } + } + + for (auto it = cx->tssMetrics.begin(); it != cx->tssMetrics.end();) { + if (seenTssIds.count(it->first)) { + it++; + } else { + // TODO REMOVE + printf("Erasing tss %s from tss_metrics\n", it->first.toString().c_str()); + it = cx->tssMetrics.erase(it); + } + } + + cx->queueModel.removeOldTssData(clientInfo.id); + } + } +} + +ACTOR static Future handleTssMismatches(DatabaseContext* cx) { + state Reference tr; + state KeyBackedMap tssMapDB = KeyBackedMap(tssMappingKeys.begin); + loop { + state UID tssID = waitNext(cx->tssMismatchStream.getFuture()); + // find ss pair id so we can remove it from the mapping + state UID tssPairID; + bool 
found = false; + for (const auto& it : cx->clientInfo->get().tssMapping) { + if (it.second.id() == tssID) { + tssPairID = it.first; + found = true; + break; + } + } + // TODO maybe instead of assert, do a trace event because it's possible that by the time we checked the mismatch + // the tss is gone? + if (found) { + // TODO add trace event + TEST(true); // killing TSS because it got mismatch + printf("KILLING TSS %s (partner=%s) BECAUSE OF TSS MISMATCH\n", + tssID.toString().c_str(), + tssPairID.toString().c_str()); + + // TODO we could write something to the system keyspace and then have DD listen to that keyspace and then DD + // do exactly this, so why not just cut out the middle man (or the middle system keys, as it were) + tr = makeReference(Database(Reference::addRef(cx))); + loop { + try { + tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + + tr->clear(serverTagKeyFor(tssID)); + tssMapDB.erase(tr, tssPairID); + + tr->set(tssMappingChangeKey, deterministicRandom()->randomUniqueID().toString()); + wait(tr->commit()); + + break; + } catch (Error& e) { + printf("Kill Mismatch TSS Transaction got error %d\n", e.code()); + wait(tr->onError(e)); + } + } + tr = makeReference(); // clear out txn so that the extra ref gets decref'd and we + // can free cx + + } else { + TEST(true); // Not killing TSS with mismatch because it's already gone + printf("Not killing TSS %s because of tss mismatch, must be already removed\n", tssID.toString().c_str()); + } + } +} + ACTOR static Future getHealthMetricsActor(DatabaseContext* cx, bool detailed) { if (now() - cx->healthMetricsLastUpdated < CLIENT_KNOBS->AGGREGATE_HEALTH_METRICS_MAX_STALENESS) { if (detailed) { @@ -960,6 +1171,8 @@ DatabaseContext::DatabaseContext(Reference> clientInfo, DatabaseContext::~DatabaseContext() { cacheListMonitor.cancel(); monitorProxiesInfoChange.cancel(); + monitorTssInfoChange.cancel(); + tssMismatchHandler.cancel(); for 
(auto it = server_interf.begin(); it != server_interf.end(); it = server_interf.erase(it)) it->second->notifyContextDestroyed(); ASSERT_ABORT(server_interf.empty()); @@ -2345,6 +2560,11 @@ ACTOR Future getKey(Database cx, KeySelector k, Future version, Tr "NativeAPI.getKey.Before"); //.detail("StartKey", // k.getKey()).detail("Offset",k.offset).detail("OrEqual",k.orEqual); ++cx->transactionPhysicalReads; + + GetKeyRequest req( + span.context, k, version.get(), cx->sampleReadTags() ? tags : Optional(), getKeyID); + req.arena.dependsOn(k.arena()); + state GetKeyReply reply; try { choose { @@ -2353,11 +2573,7 @@ ACTOR Future getKey(Database cx, KeySelector k, Future version, Tr wait(loadBalance(cx.getPtr(), ssi.second, &StorageServerInterface::getKey, - GetKeyRequest(span.context, - k, - version.get(), - cx->sampleReadTags() ? tags : Optional(), - getKeyID), + req, TaskPriority::DefaultPromiseEndpoint, false, cx->enableLocalityLoadBalance ? &cx->queueModel : nullptr))) { @@ -2718,6 +2934,9 @@ ACTOR Future getExactRange(Database cx, req.end = firstGreaterOrEqual(range.end); req.spanContext = span.context; + // keep shard's arena around in case of async tss comparison + req.arena.dependsOn(locations[shard].first.arena()); + transformRangeLimits(limits, reverse, req); ASSERT(req.limitBytes > 0 && req.limit != 0 && req.limit < 0 == reverse); @@ -3034,6 +3253,9 @@ ACTOR Future getRange(Database cx, req.isFetchKeys = (info.taskID == TaskPriority::FetchKeys); req.version = readVersion; + // In case of async tss comparison, also make req arena depend on begin, end, and/or shard's arena depending + // on which is used + bool dependOnShard = false; if (reverse && (begin - 1).isDefinitelyLess(shard.begin) && (!begin.isFirstGreaterOrEqual() || begin.getKey() != shard.begin)) { // In this case we would be setting modifiedSelectors to true, but @@ -3041,14 +3263,23 @@ ACTOR Future getRange(Database cx, req.begin = firstGreaterOrEqual(shard.begin); modifiedSelectors = true; - } else 
+ req.arena.dependsOn(shard.arena()); + dependOnShard = true; + } else { req.begin = begin; + req.arena.dependsOn(begin.arena()); + } if (!reverse && end.isDefinitelyGreater(shard.end)) { req.end = firstGreaterOrEqual(shard.end); modifiedSelectors = true; - } else + if (!dependOnShard) { + req.arena.dependsOn(shard.arena()); + } + } else { req.end = end; + req.arena.dependsOn(end.arena()); + } transformRangeLimits(limits, reverse, req); ASSERT(req.limitBytes > 0 && req.limit != 0 && req.limit < 0 == reverse); @@ -3133,10 +3364,18 @@ ACTOR Future getRange(Database cx, output.readThroughEnd = readThroughEnd; if (BUGGIFY && limits.hasByteLimit() && output.size() > std::max(1, originalLimits.minRows)) { + printf("Buggify resizing in nativeapi\n"); + // Copy instead of resizing because TSS maybe be using output's arena for comparison. This only + // happens in simulation so it's fine + Standalone copy; + int newSize = + deterministicRandom()->randomInt(std::max(1, originalLimits.minRows), output.size()); + for (int i = 0; i < newSize; i++) { + copy.push_back_deep(copy.arena(), output[i]); + } + output = copy; output.more = true; - output.resize( - output.arena(), - deterministicRandom()->randomInt(std::max(1, originalLimits.minRows), output.size())); + getRangeFinished(cx, trLogInfo, startTime, @@ -4180,6 +4419,8 @@ ACTOR static Future tryCommit(Database cx, choose { when(wait(cx->onProxiesChanged())) { reply.cancel(); + // TODO REMOVE + printf("tryCommit proxies changed ERROR!\n"); throw request_maybe_delivered(); } when(CommitID ci = wait(reply)) { diff --git a/fdbclient/Schemas.cpp b/fdbclient/Schemas.cpp index 124fb17873..514866fe83 100644 --- a/fdbclient/Schemas.cpp +++ b/fdbclient/Schemas.cpp @@ -431,6 +431,7 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema( "seconds" : 1.0, "versions" : 1000000 }, + "active_tss_count":0, "degraded_processes":0, "database_available":true, "database_lock_state": { @@ -729,6 +730,19 @@ const KeyRef 
JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema( "memory-2", "memory-radixtree-beta" ]}, + "tss_count":1, + "tss_storage_engine":{ + "$enum":[ + "ssd", + "ssd-1", + "ssd-2", + "ssd-redwood-experimental", + "ssd-rocksdb-experimental", + "memory", + "memory-1", + "memory-2", + "memory-radixtree-beta" + ]}, "coordinators_count":1, "excluded_servers":[ { @@ -802,7 +816,8 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema( } } ], - "least_operating_space_bytes_storage_server":0 + "least_operating_space_bytes_storage_server":0, + "max_machine_failures_without_losing_data":0 }, "machines":{ "$map":{ diff --git a/fdbclient/StorageServerInterface.cpp b/fdbclient/StorageServerInterface.cpp new file mode 100644 index 0000000000..180d1b814c --- /dev/null +++ b/fdbclient/StorageServerInterface.cpp @@ -0,0 +1,465 @@ +/* + * StorageServerInterface.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "fdbclient/StorageServerInterface.h" +#include "flow/crc32c.h" // for crc32c_append, to checksum values in tss trace events + +// Includes template specializations for all tss operations on storage server types. +// New StorageServerInterface reply types must be added here or it won't compile. 
+ +// if size + hex of checksum is shorter than value, record that instead of actual value. break-even point is 12 +// characters +std::string traceChecksumValue(ValueRef s) { + return s.size() > 12 ? format("(%d)%08x", s.size(), crc32c_append(0, s.begin(), s.size())) : s.toString(); +} + +template <> +bool TSS_doCompare(const GetValueRequest& req, + const GetValueReply& src, + const GetValueReply& tss, + Severity traceSeverity, + UID tssId) { + if (src.value.present() != tss.value.present() || (src.value.present() && src.value.get() != tss.value.get())) { + printf("GetValue %s @ %lld mismatch: src=%s, tss=%s\n", + req.key.printable().c_str(), + req.version, + src.value.present() ? traceChecksumValue(src.value.get()).c_str() : "missing", + tss.value.present() ? traceChecksumValue(tss.value.get()).c_str() : "missing"); + TraceEvent(traceSeverity, "TSSMismatchGetValue") + .suppressFor(1.0) + .detail("TSSID", tssId) + .detail("Key", req.key.printable()) + .detail("Version", req.version) + .detail("SSReply", src.value.present() ? traceChecksumValue(src.value.get()) : "missing") + .detail("TSSReply", tss.value.present() ? traceChecksumValue(tss.value.get()) : "missing"); + + return false; + } + // printf("tss GetValueReply matched! src=%s, tss=%s\n", src.value.present() ? src.value.get().toString().c_str() : + // "missing", tss.value.present() ? tss.value.get().toString().c_str() : "missing"); + return true; +} + +template <> +bool TSS_doCompare(const GetKeyRequest& req, + const GetKeyReply& src, + const GetKeyReply& tss, + Severity traceSeverity, + UID tssId) { + // This process is a bit complicated. Since the tss and ss can return different results if neighboring shards to + // req.sel.key are currently being moved, We validate that the results are the same IF the returned key selectors + // are final. 
Otherwise, we only mark the request as a mismatch if the difference between the two returned key + // selectors could ONLY be because of different results from the storage engines. We can afford to only partially + // check key selectors that start in a TSS shard and end in a non-TSS shard because the other read queries and the + // consistency check will eventually catch a misbehaving storage engine. + bool matches = true; + // printf("GetKey %s:<%s:%d @ %lld start:\n", + // req.sel.getKey().toString().c_str(), req.sel.orEqual ? "=" : "", req.sel.offset, req.version); + if (src.sel.orEqual == tss.sel.orEqual && src.sel.offset == tss.sel.offset) { + // full matching case + if (src.sel.offset == 0 && src.sel.orEqual) { + // found exact key, should be identical + matches = src.sel.getKey() == tss.sel.getKey(); + } + // if the query doesn't return the final key, there is an edge case where the ss and tss have different shard + // boundaries, so they pass different shard boundary keys back for the same offset + } else if (src.sel.getKey() == tss.sel.getKey()) { + // There is one case with a positive offset where the shard boundary the incomplete query stopped at is the next + // key in the shard that the complete query returned. 
This is not possible with a negative offset because the + // shard boundary is exclusive backwards + if (src.sel.offset == 0 && src.sel.orEqual && tss.sel.offset == 1 && !tss.sel.orEqual) { + // case where ss was complete and tss was incomplete + } else if (tss.sel.offset == 0 && tss.sel.orEqual && src.sel.offset == 1 && !src.sel.orEqual) { + // case where tss was complete and ss was incomplete + } else { + matches = false; + } + } else { + // ss/tss returned different keys, and different offsets and/or orEqual + // here we just validate that ordering of the keys matches the ordering of the offsets + bool tssKeyLarger = src.sel.getKey() < tss.sel.getKey(); + // the only case offsets are equal and orEqual aren't equal is the case with a negative offset, + // where one response has <=0 with the actual result and the other has <0 with the shard upper boundary. + // So whichever one has the actual result should have the lower key. + bool tssOffsetLarger = (src.sel.offset == tss.sel.offset) ? tss.sel.orEqual : src.sel.offset < tss.sel.offset; + // printf(" partial comparison: tssLarger=%s, tssOffsetLarger=%s, matches=%s\n", tssKeyLarger ? "T" : "F", + // tssOffsetLarger ? "T": "F", matches ? "T" : "F"); + matches = tssKeyLarger != tssOffsetLarger; + } + if (!matches) { + // TODO REMOVE print + printf("GetKey %s:<%s:%d @ %lld mismatch: src=%s:<%s:%d, tss=%s:<%s:%d\n", + req.sel.getKey().printable().c_str(), + req.sel.orEqual ? "=" : "", + req.sel.offset, + req.version, + src.sel.getKey().printable().c_str(), + src.sel.orEqual ? "=" : "", + src.sel.offset, + tss.sel.getKey().printable().c_str(), + tss.sel.orEqual ? "=" : "", + tss.sel.offset); + TraceEvent(traceSeverity, "TSSMismatchGetKey") + .suppressFor(1.0) + .detail("TSSID", tssId) + .detail("KeySelector", + format("%s%s:%d", req.sel.orEqual ? "=" : "", req.sel.getKey().printable().c_str(), req.sel.offset)) + .detail("Version", req.version) + .detail("SSReply", + format("%s%s:%d", src.sel.orEqual ? 
"=" : "", src.sel.getKey().printable().c_str(), src.sel.offset)) + .detail( + "TSSReply", + format("%s%s:%d", tss.sel.orEqual ? "=" : "", tss.sel.getKey().printable().c_str(), tss.sel.offset)); + } + return matches; +} + +template <> +bool TSS_doCompare(const GetKeyValuesRequest& req, + const GetKeyValuesReply& src, + const GetKeyValuesReply& tss, + Severity traceSeverity, + UID tssId) { + if (src.more != tss.more || src.data != tss.data) { + // TODO REMOVE debugging prints + printf("GetKeyValues [%s:<%s:%d - %s:<%s:%d) @ %lld (lim=%d limB=%d) mismatch:\n", + req.begin.getKey().printable().c_str(), + req.begin.orEqual ? "=" : "", + req.begin.offset, + req.end.getKey().printable().c_str(), + req.end.orEqual ? "=" : "", + req.end.offset, + req.version, + req.limit, + req.limitBytes); + + std::string ssResultsString = format("(%d)%s:\n", src.data.size(), src.more ? "+" : ""); + printf("src= (%d)%s:", src.data.size(), src.more ? "+" : ""); + for (auto& it : src.data) { + printf(" %s=%s\n", it.key.printable().c_str(), it.value.printable().c_str()); + ssResultsString += "\n" + it.key.printable() + "=" + traceChecksumValue(it.value); + } + + std::string tssResultsString = format("(%d)%s:\n", tss.data.size(), tss.more ? "+" : ""); + printf("tss= (%d)%s:", tss.data.size(), tss.more ? "+" : ""); + for (auto& it : tss.data) { + printf(" %s=%s\n", it.key.printable().c_str(), it.value.printable().c_str()); + tssResultsString += "\n" + it.key.printable() + "=" + traceChecksumValue(it.value); + } + printf("\n"); + + TraceEvent(traceSeverity, "TSSMismatchGetKeyValues") + .suppressFor(1.0) + .detail("TSSID", tssId) + .detail( + "Begin", + format( + "%s%s:%d", req.begin.orEqual ? "=" : "", req.begin.getKey().printable().c_str(), req.begin.offset)) + .detail("End", + format("%s%s:%d", req.end.orEqual ? 
"=" : "", req.end.getKey().printable().c_str(), req.end.offset)) + .detail("Version", req.version) + .detail("Limit", req.limit) + .detail("LimitBytes", req.limitBytes) + .detail("SSReply", ssResultsString) + .detail("TSSReply", tssResultsString); + + return false; + } + /*printf("tss GetKeyValues [%s:<%s:%d - %s:<%s:%d) matched! %d=%d\n", + req.begin.getKey().printable().c_str(), req.begin.orEqual ? "=" : "", req.begin.offset, + req.end.getKey().printable().c_str(), req.end.orEqual ? "=" : "", req.end.offset, + src.data.size(), tss.data.size());*/ + return true; +} + +template <> +bool TSS_doCompare(const WatchValueRequest& req, + const WatchValueReply& src, + const WatchValueReply& tss, + Severity traceSeverity, + UID tssId) { + // TODO should this check that both returned the same version? We mainly want to duplicate watches just for load + return true; +} + +// no-op template specializations for metrics replies +template <> +bool TSS_doCompare(const WaitMetricsRequest& req, + const StorageMetrics& src, + const StorageMetrics& tss, + Severity traceSeverity, + UID tssId) { + return true; +} + +template <> +bool TSS_doCompare(const SplitMetricsRequest& req, + const SplitMetricsReply& src, + const SplitMetricsReply& tss, + Severity traceSeverity, + UID tssId) { + return true; +} + +template <> +bool TSS_doCompare(const ReadHotSubRangeRequest& req, + const ReadHotSubRangeReply& src, + const ReadHotSubRangeReply& tss, + Severity traceSeverity, + UID tssId) { + return true; +} + +template <> +bool TSS_doCompare(const SplitRangeRequest& req, + const SplitRangeReply& src, + const SplitRangeReply& tss, + Severity traceSeverity, + UID tssId) { + // TODO in theory this should return the same response from both right? 
+ return true; +} + +// don't duplicate \xff reads or fetchKeys (avoid adding load to servers) +template <> +bool TSS_shouldDuplicateRequest(const GetValueRequest& req) { + return req.key.size() == 0 || req.key[0] != 0xff; +} + +template <> +bool TSS_shouldDuplicateRequest(const GetKeyRequest& req) { + return req.sel.getKey().size() == 0 || req.sel.getKey()[0] != 0xff; +} + +template <> +bool TSS_shouldDuplicateRequest(const GetKeyValuesRequest& req) { + return (req.begin.getKey().size() == 0 || req.begin.getKey()[0] != 0xff || req.end.getKey().size() == 0 || + req.end.getKey()[0] != 0xff) && + !req.isFetchKeys; +} + +template <> +bool TSS_shouldDuplicateRequest(const WatchValueRequest& req) { + return req.key.size() == 0 || req.key[0] != 0xff; +} + +template <> +bool TSS_shouldDuplicateRequest(const WaitMetricsRequest& req) { + return false; +} + +template <> +bool TSS_shouldDuplicateRequest(const SplitMetricsRequest& req) { + return false; +} + +template <> +bool TSS_shouldDuplicateRequest(const ReadHotSubRangeRequest& req) { + return false; +} + +template <> +bool TSS_shouldDuplicateRequest(const SplitRangeRequest& req) { + return false; +} + +// only record metrics for data reads + +template <> +void TSSMetrics::recordLatency(const GetValueRequest& req, double ssLatency, double tssLatency) { + SSgetValueLatency.addSample(ssLatency); + TSSgetValueLatency.addSample(tssLatency); +} + +template <> +void TSSMetrics::recordLatency(const GetKeyRequest& req, double ssLatency, double tssLatency) { + SSgetKeyLatency.addSample(ssLatency); + TSSgetKeyLatency.addSample(tssLatency); +} + +template <> +void TSSMetrics::recordLatency(const GetKeyValuesRequest& req, double ssLatency, double tssLatency) { + SSgetKeyValuesLatency.addSample(ssLatency); + TSSgetKeyValuesLatency.addSample(tssLatency); +} + +template <> +void TSSMetrics::recordLatency(const WatchValueRequest& req, double ssLatency, double tssLatency) {} + +template <> +void TSSMetrics::recordLatency(const 
WaitMetricsRequest& req, double ssLatency, double tssLatency) {} + +template <> +void TSSMetrics::recordLatency(const SplitMetricsRequest& req, double ssLatency, double tssLatency) {} + +template <> +void TSSMetrics::recordLatency(const ReadHotSubRangeRequest& req, double ssLatency, double tssLatency) {} + +template <> +void TSSMetrics::recordLatency(const SplitRangeRequest& req, double ssLatency, double tssLatency) {} + +// ------------------- + +// TODO ADD UNIT TESTS for compare methods, especially GetKey!! +TEST_CASE("/StorageServerInterface/TSSCompare/TestComparison") { + printf("testing tss comparisons\n"); + + // test getValue + GetValueRequest gvReq; + gvReq.key = StringRef("a"); + gvReq.version = 5; + + UID tssId; + + GetValueReply gvReplyMissing; + GetValueReply gvReplyA(Optional(StringRef("a")), false); + GetValueReply gvReplyB(Optional(StringRef("b")), false); + ASSERT(TSS_doCompare(gvReq, gvReplyMissing, gvReplyMissing, SevInfo, tssId)); + ASSERT(TSS_doCompare(gvReq, gvReplyA, gvReplyA, SevInfo, tssId)); + ASSERT(TSS_doCompare(gvReq, gvReplyB, gvReplyB, SevInfo, tssId)); + + ASSERT(!TSS_doCompare(gvReq, gvReplyMissing, gvReplyA, SevInfo, tssId)); + ASSERT(!TSS_doCompare(gvReq, gvReplyA, gvReplyB, SevInfo, tssId)); + + // test GetKeyValues + Arena a; // for all of the refs. ASAN complains if this isn't done. 
Could also make them all standalone i guess + GetKeyValuesRequest gkvReq; + gkvReq.begin = firstGreaterOrEqual(StringRef(a, "A")); + gkvReq.end = firstGreaterOrEqual(StringRef(a, "C")); + gkvReq.version = 5; + + GetKeyValuesReply gkvReplyEmpty; + GetKeyValuesReply gkvReplyOne; + KeyValueRef v; + v.key = StringRef(a, "a"); + v.value = StringRef(a, "1"); + gkvReplyOne.data.push_back_deep(gkvReplyOne.arena, v); + GetKeyValuesReply gkvReplyOneMore; + gkvReplyOneMore.data.push_back_deep(gkvReplyOneMore.arena, v); + gkvReplyOneMore.more = true; + + ASSERT(TSS_doCompare(gkvReq, gkvReplyEmpty, gkvReplyEmpty, SevInfo, tssId)); + ASSERT(TSS_doCompare(gkvReq, gkvReplyOne, gkvReplyOne, SevInfo, tssId)); + ASSERT(TSS_doCompare(gkvReq, gkvReplyOneMore, gkvReplyOneMore, SevInfo, tssId)); + ASSERT(!TSS_doCompare(gkvReq, gkvReplyEmpty, gkvReplyOne, SevInfo, tssId)); + ASSERT(!TSS_doCompare(gkvReq, gkvReplyOne, gkvReplyOneMore, SevInfo, tssId)); + + // test GetKey + GetKeyRequest gkReq; + gkReq.sel = KeySelectorRef(StringRef(a, "Z"), false, 1); + gkReq.version = 5; + + GetKeyReply gkReplyA(KeySelectorRef(StringRef(a, "A"), false, 20), false); + GetKeyReply gkReplyB(KeySelectorRef(StringRef(a, "B"), false, 10), false); + GetKeyReply gkReplyC(KeySelectorRef(StringRef(a, "C"), true, 0), false); + GetKeyReply gkReplyD(KeySelectorRef(StringRef(a, "D"), false, -10), false); + GetKeyReply gkReplyE(KeySelectorRef(StringRef(a, "E"), false, -20), false); + + // identical cases + ASSERT(TSS_doCompare(gkReq, gkReplyA, gkReplyA, SevInfo, tssId)); + ASSERT(TSS_doCompare(gkReq, gkReplyB, gkReplyB, SevInfo, tssId)); + ASSERT(TSS_doCompare(gkReq, gkReplyC, gkReplyC, SevInfo, tssId)); + ASSERT(TSS_doCompare(gkReq, gkReplyD, gkReplyD, SevInfo, tssId)); + ASSERT(TSS_doCompare(gkReq, gkReplyE, gkReplyE, SevInfo, tssId)); + + // relative offset cases + ASSERT(TSS_doCompare(gkReq, gkReplyA, gkReplyB, SevInfo, tssId)); + ASSERT(TSS_doCompare(gkReq, gkReplyB, gkReplyA, SevInfo, tssId)); + 
ASSERT(TSS_doCompare(gkReq, gkReplyA, gkReplyC, SevInfo, tssId)); + ASSERT(TSS_doCompare(gkReq, gkReplyC, gkReplyA, SevInfo, tssId)); + ASSERT(TSS_doCompare(gkReq, gkReplyB, gkReplyC, SevInfo, tssId)); + ASSERT(TSS_doCompare(gkReq, gkReplyC, gkReplyB, SevInfo, tssId)); + + ASSERT(TSS_doCompare(gkReq, gkReplyC, gkReplyD, SevInfo, tssId)); + ASSERT(TSS_doCompare(gkReq, gkReplyD, gkReplyC, SevInfo, tssId)); + ASSERT(TSS_doCompare(gkReq, gkReplyC, gkReplyE, SevInfo, tssId)); + ASSERT(TSS_doCompare(gkReq, gkReplyE, gkReplyC, SevInfo, tssId)); + ASSERT(TSS_doCompare(gkReq, gkReplyD, gkReplyE, SevInfo, tssId)); + ASSERT(TSS_doCompare(gkReq, gkReplyE, gkReplyD, SevInfo, tssId)); + + // test same offset/orEqual wrong key + ASSERT(!TSS_doCompare(gkReq, + GetKeyReply(KeySelectorRef(StringRef(a, "A"), true, 0), false), + GetKeyReply(KeySelectorRef(StringRef("B"), true, 0), false), + SevInfo, + tssId)); + // this could be from different shard boundaries, so don't say it's a mismatch + ASSERT(TSS_doCompare(gkReq, + GetKeyReply(KeySelectorRef(StringRef(a, "A"), false, 10), false), + GetKeyReply(KeySelectorRef(StringRef(a, "B"), false, 10), false), + SevInfo, + tssId)); + + // test offsets and key difference don't match + ASSERT(!TSS_doCompare(gkReq, + GetKeyReply(KeySelectorRef(StringRef(a, "A"), false, 0), false), + GetKeyReply(KeySelectorRef(StringRef("B"), false, 10), false), + SevInfo, + tssId)); + ASSERT(!TSS_doCompare(gkReq, + GetKeyReply(KeySelectorRef(StringRef(a, "A"), false, -10), false), + GetKeyReply(KeySelectorRef(StringRef("B"), false, 0), false), + SevInfo, + tssId)); + + // test key is next over in one shard, one found it and other didn't + // positive + // one that didn't find is +1 + ASSERT(TSS_doCompare(gkReq, + GetKeyReply(KeySelectorRef(StringRef(a, "A"), false, 1), false), + GetKeyReply(KeySelectorRef(StringRef("B"), true, 0), false), + SevInfo, + tssId)); + ASSERT(!TSS_doCompare(gkReq, + GetKeyReply(KeySelectorRef(StringRef(a, "A"), true, 0), false), + 
GetKeyReply(KeySelectorRef(StringRef("B"), false, 1), false), + SevInfo, + tssId)); + + // negative will have zero offset but not equal set + ASSERT(TSS_doCompare(gkReq, + GetKeyReply(KeySelectorRef(StringRef(a, "A"), true, 0), false), + GetKeyReply(KeySelectorRef(StringRef("B"), false, 0), false), + SevInfo, + tssId)); + ASSERT(!TSS_doCompare(gkReq, + GetKeyReply(KeySelectorRef(StringRef(a, "A"), false, 0), false), + GetKeyReply(KeySelectorRef(StringRef("B"), true, 0), false), + SevInfo, + tssId)); + + // test shard boundary key returned by incomplete query is the same as the key found by the other (only possible in + // positive direction) + ASSERT(TSS_doCompare(gkReq, + GetKeyReply(KeySelectorRef(StringRef(a, "A"), true, 0), false), + GetKeyReply(KeySelectorRef(StringRef("A"), false, 1), false), + SevInfo, + tssId)); + + // explictly test checksum function + std::string s = "A"; + std::string s12 = "ABCDEFGHIJKL"; + std::string s13 = "ABCDEFGHIJKLO"; + std::string checksumStart13 = "(13)"; + ASSERT(s == traceChecksumValue(StringRef(s))); + ASSERT(s12 == traceChecksumValue(StringRef(s12))); + ASSERT(checksumStart13 == traceChecksumValue(StringRef(s13)).substr(0, 4)); + return Void(); +} \ No newline at end of file diff --git a/fdbclient/StorageServerInterface.h b/fdbclient/StorageServerInterface.h index 84971f040b..9a514a447e 100644 --- a/fdbclient/StorageServerInterface.h +++ b/fdbclient/StorageServerInterface.h @@ -29,7 +29,9 @@ #include "fdbrpc/LoadBalance.actor.h" #include "fdbrpc/Stats.h" #include "fdbrpc/TimedRequest.h" +#include "fdbrpc/TSSComparison.h" #include "fdbclient/TagThrottle.h" +#include "flow/UnitTest.h" // Dead code, removed in the next protocol version struct VersionReply { @@ -54,6 +56,10 @@ struct StorageServerInterface { LocalityData locality; UID uniqueID; + // TODO get rid of explicit mapping? 
+ // Effectively implements Optional but serializer didn't like Optional + bool isTss; + UID tssPairID; RequestStream getValue; RequestStream getKey; @@ -74,8 +80,8 @@ struct StorageServerInterface { RequestStream getReadHotRanges; RequestStream getRangeSplitPoints; - explicit StorageServerInterface(UID uid) : uniqueID(uid) {} - StorageServerInterface() : uniqueID(deterministicRandom()->randomUniqueID()) {} + explicit StorageServerInterface(UID uid) : uniqueID(uid), isTss(false) {} + StorageServerInterface() : uniqueID(deterministicRandom()->randomUniqueID()), isTss(false) {} NetworkAddress address() const { return getValue.getEndpoint().getPrimaryAddress(); } NetworkAddress stableAddress() const { return getValue.getEndpoint().getStableAddress(); } Optional secondaryAddress() const { return getValue.getEndpoint().addresses.secondaryAddress; } @@ -88,7 +94,11 @@ struct StorageServerInterface { // considered if (ar.protocolVersion().hasSmallEndpoints()) { - serializer(ar, uniqueID, locality, getValue); + if (ar.protocolVersion().hasTSS()) { + serializer(ar, uniqueID, locality, getValue, isTss, tssPairID); + } else { + serializer(ar, uniqueID, locality, getValue); + } if (Ar::isDeserializing) { getKey = RequestStream(getValue.getEndpoint().getAdjustedEndpoint(1)); getKeyValues = RequestStream(getValue.getEndpoint().getAdjustedEndpoint(2)); @@ -127,8 +137,9 @@ struct StorageServerInterface { waitFailure, getQueuingMetrics, getKeyValueStoreType); - if (ar.protocolVersion().hasWatches()) + if (ar.protocolVersion().hasWatches()) { serializer(ar, watchValue); + } } } bool operator==(StorageServerInterface const& s) const { return uniqueID == s.uniqueID; } diff --git a/fdbclient/SystemData.cpp b/fdbclient/SystemData.cpp index 0f035b745c..1d7a750fe5 100644 --- a/fdbclient/SystemData.cpp +++ b/fdbclient/SystemData.cpp @@ -25,6 +25,7 @@ #include "flow/Arena.h" #include "flow/TDMetric.actor.h" #include "flow/serialize.h" +#include "flow/UnitTest.h" const KeyRef 
systemKeysPrefix = LiteralStringRef("\xff"); const KeyRangeRef normalKeys(KeyRef(), systemKeysPrefix); @@ -345,7 +346,11 @@ uint16_t cacheChangeKeyDecodeIndex(const KeyRef& key) { return idx; } +const KeyRef tssMappingChangeKey = LiteralStringRef("\xff\x02/tssMappingChangeKey"); +const KeyRangeRef tssMappingKeys(LiteralStringRef("\xff/tss/"), LiteralStringRef("\xff/tss0")); + const KeyRangeRef serverTagKeys(LiteralStringRef("\xff/serverTag/"), LiteralStringRef("\xff/serverTag0")); + const KeyRef serverTagPrefix = serverTagKeys.begin; const KeyRangeRef serverTagConflictKeys(LiteralStringRef("\xff/serverTagConflict/"), LiteralStringRef("\xff/serverTagConflict0")); @@ -532,6 +537,7 @@ const Key serverListKeyFor(UID serverID) { return wr.toValue(); } +// TODO use flatbuffers depending on version const Value serverListValue(StorageServerInterface const& server) { BinaryWriter wr(IncludeVersion(ProtocolVersion::withServerListValue())); wr << server; @@ -550,6 +556,18 @@ StorageServerInterface decodeServerListValue(ValueRef const& value) { return s; } +// TODO merge this with above stuff or something +const Value serverListValueFB(StorageServerInterface const& server) { + return ObjectWriter::toValue(server, IncludeVersion()); +} + +StorageServerInterface decodeServerListValueFB(ValueRef const& value) { + StorageServerInterface s; + ObjectReader reader(value.begin(), IncludeVersion()); + reader.deserialize(s); + return s; +} + // processClassKeys.contains(k) iff k.startsWith( processClassKeys.begin ) because '/'+1 == '0' const KeyRangeRef processClassKeys(LiteralStringRef("\xff/processClass/"), LiteralStringRef("\xff/processClass0")); const KeyRef processClassPrefix = processClassKeys.begin; @@ -636,15 +654,17 @@ std::string encodeFailedServersKey(AddressExclusion const& addr) { // const KeyRangeRef globalConfigKeys( LiteralStringRef("\xff/globalConfig/"), LiteralStringRef("\xff/globalConfig0") ); // const KeyRef globalConfigPrefix = globalConfigKeys.begin; -const 
KeyRangeRef globalConfigDataKeys( LiteralStringRef("\xff/globalConfig/k/"), LiteralStringRef("\xff/globalConfig/k0") ); +const KeyRangeRef globalConfigDataKeys(LiteralStringRef("\xff/globalConfig/k/"), + LiteralStringRef("\xff/globalConfig/k0")); const KeyRef globalConfigKeysPrefix = globalConfigDataKeys.begin; -const KeyRangeRef globalConfigHistoryKeys( LiteralStringRef("\xff/globalConfig/h/"), LiteralStringRef("\xff/globalConfig/h0") ); +const KeyRangeRef globalConfigHistoryKeys(LiteralStringRef("\xff/globalConfig/h/"), + LiteralStringRef("\xff/globalConfig/h0")); const KeyRef globalConfigHistoryPrefix = globalConfigHistoryKeys.begin; const KeyRef globalConfigVersionKey = LiteralStringRef("\xff/globalConfig/v"); -const KeyRangeRef workerListKeys( LiteralStringRef("\xff/worker/"), LiteralStringRef("\xff/worker0") ); +const KeyRangeRef workerListKeys(LiteralStringRef("\xff/worker/"), LiteralStringRef("\xff/worker0")); const KeyRef workerListPrefix = workerListKeys.begin; const Key workerListKeyFor(StringRef processID) { @@ -1085,3 +1105,62 @@ const KeyRangeRef testOnlyTxnStateStorePrefixRange(LiteralStringRef("\xff/TESTON const KeyRef writeRecoveryKey = LiteralStringRef("\xff/writeRecovery"); const ValueRef writeRecoveryKeyTrue = LiteralStringRef("1"); const KeyRef snapshotEndVersionKey = LiteralStringRef("\xff/snapshotEndVersion"); + +// for tests +void testSSISerdes(StorageServerInterface const& ssi, bool useFB) { + printf("ssi=\nid=%s\nlocality=%s\nisTss=%s\ntssId=%s\naddress=%s\ngetValue=%s\n\n\n", + ssi.id().toString().c_str(), + ssi.locality.toString().c_str(), + ssi.isTss ? "true" : "false", + ssi.isTss ? ssi.tssPairID.toString().c_str() : "", + ssi.address().toString().c_str(), + ssi.getValue.getEndpoint().token.toString().c_str()); + + StorageServerInterface ssi2 = + (useFB) ? 
decodeServerListValueFB(serverListValueFB(ssi)) : decodeServerListValue(serverListValue(ssi)); + + printf("ssi2=\nid=%s\nlocality=%s\nisTss=%s\ntssId=%s\naddress=%s\ngetValue=%s\n\n\n", + ssi2.id().toString().c_str(), + ssi2.locality.toString().c_str(), + ssi2.isTss ? "true" : "false", + ssi2.isTss ? ssi2.tssPairID.toString().c_str() : "", + ssi2.address().toString().c_str(), + ssi2.getValue.getEndpoint().token.toString().c_str()); + + ASSERT(ssi.id() == ssi2.id()); + ASSERT(ssi.locality == ssi2.locality); + ASSERT(ssi.isTss == ssi2.isTss); + if (ssi.isTss) { + ASSERT(ssi2.tssPairID == ssi2.tssPairID); + } + ASSERT(ssi.address() == ssi2.address()); + ASSERT(ssi.getValue.getEndpoint().token == ssi2.getValue.getEndpoint().token); +} + +// unit test for serialization since tss stuff had bugs +TEST_CASE("/SystemData/SerDes/SSI") { + printf("testing ssi serdes\n"); + LocalityData localityData(Optional>(), + Standalone(deterministicRandom()->randomUniqueID().toString()), + Standalone(deterministicRandom()->randomUniqueID().toString()), + Optional>()); + + // non-tss + StorageServerInterface ssi; + ssi.uniqueID = UID(0x1234123412341234, 0x5678567856785678); + ssi.locality = localityData; + ssi.isTss = false; + ssi.initEndpoints(); + + testSSISerdes(ssi, false); + testSSISerdes(ssi, true); + + ssi.isTss = true; + ssi.tssPairID = UID(0x2345234523452345, 0x1238123812381238); + + testSSISerdes(ssi, false); + testSSISerdes(ssi, true); + printf("ssi serdes test complete\n"); + + return Void(); +} \ No newline at end of file diff --git a/fdbclient/SystemData.h b/fdbclient/SystemData.h index 79efb688c8..b9efe1e8a5 100644 --- a/fdbclient/SystemData.h +++ b/fdbclient/SystemData.h @@ -115,6 +115,11 @@ extern const KeyRef cacheChangePrefix; const Key cacheChangeKeyFor(uint16_t idx); uint16_t cacheChangeKeyDecodeIndex(const KeyRef& key); +// "\xff/tss/[[serverId]]" := "[[tssId]]" +extern const KeyRef tssMappingChangeKey; +extern const KeyRangeRef tssMappingKeys; +extern const KeyRef 
tssMappingPrefix; + // "\xff/serverTag/[[serverID]]" = "[[Tag]]" // Provides the Tag for the given serverID. Used to access a // storage server's corresponding TLog in order to apply mutations. diff --git a/fdbrpc/CMakeLists.txt b/fdbrpc/CMakeLists.txt index 055e497034..89ec859e8f 100644 --- a/fdbrpc/CMakeLists.txt +++ b/fdbrpc/CMakeLists.txt @@ -29,7 +29,8 @@ set(FDBRPC_SRCS sim2.actor.cpp sim_validation.cpp TimedRequest.h - TraceFileIO.cpp) + TraceFileIO.cpp + TSSComparison.h) set(COMPILE_EIO OFF) diff --git a/fdbrpc/LoadBalance.actor.h b/fdbrpc/LoadBalance.actor.h index 393c3c0ee2..2f1ee375bf 100644 --- a/fdbrpc/LoadBalance.actor.h +++ b/fdbrpc/LoadBalance.actor.h @@ -31,11 +31,16 @@ #include "flow/flow.h" #include "flow/Knobs.h" +// TODO REMOVE? +#include + #include "fdbrpc/FailureMonitor.h" #include "fdbrpc/fdbrpc.h" #include "fdbrpc/Locality.h" #include "fdbrpc/QueueModel.h" #include "fdbrpc/MultiInterface.h" +#include "fdbrpc/simulator.h" // for checking tss simulation mode +#include "fdbrpc/TSSComparison.h" #include "flow/actorcompiler.h" // This must be the last #include. 
using std::vector; @@ -75,6 +80,82 @@ struct LoadBalancedReply { Optional getLoadBalancedReply(const LoadBalancedReply* reply); Optional getLoadBalancedReply(const void*); +ACTOR template +Future tssComparison(Req req, + Future> fSource, + Future> fTss, + TSSEndpointData tssData) { + // TODO add timeout and time requests + state double startTime = now(); + state Future>> fTssWithTimeout = timeout(fTss, 5.0 /*TODO knob?*/); + state int finished = 0; + state double srcEndTime; + state double tssEndTime; + + loop { + choose { + when(state ErrorOr src = wait(fSource)) { + srcEndTime = now(); + fSource = Never(); + finished++; + if (finished == 2) { + break; + } + } + when(state Optional> tss = wait(fTssWithTimeout)) { + tssEndTime = now(); + fTssWithTimeout = Never(); + finished++; + if (finished == 2) { + break; + } + } + } + } + + ++tssData.metrics->requests; + + if (src.isError()) { + ++tssData.metrics->ssErrors; + } + if (!tss.present()) { + ++tssData.metrics->tssTimeouts; + } else if (tss.get().isError()) { + ++tssData.metrics->tssErrors; + printf("Tss got error %d\n", tss.get().getError().code()); + } + if (!src.isError() && tss.present() && !tss.get().isError()) { + Optional srcLB = getLoadBalancedReply(&src.get()); + Optional tssLB = getLoadBalancedReply(&tss.get().get()); + ASSERT(srcLB.present() == + tssLB.present()); // getLoadBalancedReply returned different responses for same templated type + + // if Resp is a LoadBalancedReply, only compare if both replies are non-error + if (!srcLB.present() || (!srcLB.get().error.present() && !tssLB.get().error.present())) { + // only record latency difference if both requests actually succeeded, so that we're comparing apples to + // apples + tssData.metrics->recordLatency(req, srcEndTime - startTime, tssEndTime - startTime); + + // expect mismatches in drop mutations mode. + Severity traceSeverity = + (g_network->isSimulated() && g_simulator.tssMode == ISimulator::TSSMode::EnabledDropMutations) + ? 
SevWarnAlways + : SevError; + + if (!TSS_doCompare(req, src.get(), tss.get().get(), traceSeverity, tssData.tssId)) { + ++tssData.metrics->mismatches; + } + } else if (tssLB.present() && tssLB.get().error.present()) { + ++tssData.metrics->tssErrors; + printf("Tss got LB error %d\n", tssLB.get().error.get().code()); + } else if (srcLB.present() && srcLB.get().error.present()) { + ++tssData.metrics->ssErrors; + } + } + + return Void(); +} + // Stores state for a request made by the load balancer template struct RequestData : NonCopyable { @@ -91,6 +172,26 @@ struct RequestData : NonCopyable { // This is true once setupRequest is called, even though at that point the response is Never(). bool isValid() { return response.isValid(); } + void maybeDuplicateTSSRequest(RequestStream const* stream, + Request const& request, + QueueModel* model, + Future ssResponse) { + if (model) { + // Send parallel request to TSS pair, if it exists + Optional tssData = model->getTssData(stream->getEndpoint().token.first()); + + if (tssData.present() && TSS_shouldDuplicateRequest(request)) { + resetReply(request); + + // TODO add timeout from knob to tss request? 
+ // FIXME: optimize to avoid creating new netNotifiedQueue for each message + RequestStream tssRequestStream(tssData.get().endpoint); + Future> fTssResult = tssRequestStream.tryGetReply(request); + model->addActor.send(tssComparison(request, fResult, fTssResult, tssData.get())); + } + } + } + // Initializes the request state and starts it, possibly after a backoff delay void startRequest(double backoff, bool triedAllOptions, @@ -105,12 +206,15 @@ struct RequestData : NonCopyable { delay(backoff), [this, stream, &request, model](Void _) { requestStarted = true; modelHolder = Reference(new ModelHolder(model, stream->getEndpoint().token.first())); - return stream->tryGetReply(request); + Future resp = stream->tryGetReply(request); + maybeDuplicateTSSRequest(stream, request, model, resp); + return resp; }); } else { requestStarted = true; modelHolder = Reference(new ModelHolder(model, stream->getEndpoint().token.first())); response = stream->tryGetReply(request); + maybeDuplicateTSSRequest(stream, request, model, response); } requestProcessed = false; diff --git a/fdbrpc/QueueModel.cpp b/fdbrpc/QueueModel.cpp index fa458d6738..2cb5687b61 100644 --- a/fdbrpc/QueueModel.cpp +++ b/fdbrpc/QueueModel.cpp @@ -18,6 +18,8 @@ * limitations under the License. 
*/ +#include + #include "fdbrpc/QueueModel.h" #include "fdbrpc/LoadBalance.h" @@ -60,6 +62,39 @@ double QueueModel::addRequest(uint64_t id) { return d.penalty; } +void QueueModel::updateTssEndpoint(uint64_t endpointId, TSSEndpointData tssData) { + auto& d = data[endpointId]; + if (!d.tssData.present()) { + tssCount++; + } + + d.tssData = Optional(tssData); + // TODO REMOVE print + printf("Setting tss endpoint for %" PRIx64 " = %s\n", endpointId, tssData.endpoint.token.toString().c_str()); +} + +void QueueModel::removeOldTssData(UID currentGeneration) { + if (tssCount > 0) { + // expire old tss mappings that aren't present in new mapping + for (auto& it : data) { + if (it.second.tssData.present() && it.second.tssData.get().generation != currentGeneration) { + // TODO REMOVE print + printf("Removing tss endpoint for %" PRIx64 + " because its generation %s doesn't match the current one %s\n", + it.first, + it.second.tssData.get().generation.toString().c_str(), + currentGeneration.toString().c_str()); + it.second.tssData = Optional(); + tssCount--; + } + } + } +} + +Optional QueueModel::getTssData(uint64_t id) { + return data[id].tssData; +} + Optional getLoadBalancedReply(const LoadBalancedReply* reply) { return *reply; } diff --git a/fdbrpc/QueueModel.h b/fdbrpc/QueueModel.h index 3ff07a80e9..f8592fa9a5 100644 --- a/fdbrpc/QueueModel.h +++ b/fdbrpc/QueueModel.h @@ -26,6 +26,19 @@ #include "fdbrpc/Smoother.h" #include "flow/Knobs.h" #include "flow/ActorCollection.h" +#include "fdbrpc/TSSComparison.h" // For TSS Metrics +#include "fdbrpc/FlowTransport.h" // For Endpoint + +struct TSSEndpointData { + UID tssId; + Endpoint endpoint; + Reference metrics; + UID generation; // TODO this isn't exactly like a generation since it's not ordered, i'll try to think of a better + // name + + TSSEndpointData(UID tssId, Endpoint endpoint, Reference metrics, UID generation) + : tssId(tssId), endpoint(endpoint), metrics(metrics), generation(generation) {} +}; // The data structure 
used for the client-side load balancing algorithm to // decide which storage server to read data from. Conceptually, it tracks the @@ -59,6 +72,10 @@ struct QueueData { // hasn't returned a valid result, increase above `futureVersionBackoff` // to increase the future backoff amount. double increaseBackoffTime; + + // a bit of a hack to store this here, but it's the only centralized place for per-endpoint tracking + Optional tssData; + QueueData() : latency(0.001), penalty(1.0), smoothOutstanding(FLOW_KNOBS->QUEUE_MODEL_SMOOTHING_AMOUNT), failedUntil(0), futureVersionBackoff(FLOW_KNOBS->FUTURE_VERSION_INITIAL_BACKOFF), increaseBackoffTime(0) {} @@ -91,7 +108,11 @@ public: Future laggingRequests; // requests for which a different recipient already answered int laggingRequestCount; - QueueModel() : secondMultiplier(1.0), secondBudget(0), laggingRequestCount(0) { + void updateTssEndpoint(uint64_t endpointId, TSSEndpointData endpointData); + void removeOldTssData(UID currentGeneration); + Optional getTssData(uint64_t endpointId); + + QueueModel() : secondMultiplier(1.0), secondBudget(0), laggingRequestCount(0), tssCount(0) { laggingRequests = actorCollection(addActor.getFuture(), &laggingRequestCount); } @@ -99,6 +120,7 @@ public: private: std::unordered_map data; + uint32_t tssCount; }; /* old queue model diff --git a/fdbrpc/TSSComparison.h b/fdbrpc/TSSComparison.h new file mode 100644 index 0000000000..6724e3dae7 --- /dev/null +++ b/fdbrpc/TSSComparison.h @@ -0,0 +1,78 @@ +/* + * TSSComparison.h + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * This header is to declare the tss comparison function that LoadBalance.Actor.h needs to be aware of to call, + * But StorageServerInterface.h needs to implement on the types defined in SSI.h. + */ +#ifndef FDBRPC_TSS_COMPARISON_H +#define FDBRPC_TSS_COMPARISON_H + +#include "fdbrpc/ContinuousSample.h" +#include "fdbrpc/Stats.h" + +// refcounted + noncopyable because both DatabaseContext and individual endpoints share ownership +struct TSSMetrics : ReferenceCounted, NonCopyable { + CounterCollection cc; + Counter requests; + Counter ssErrors; + Counter tssErrors; + Counter tssTimeouts; + Counter mismatches; + + // TODO we could probably just ignore getKey as it's seldom used? 
+ ContinuousSample SSgetValueLatency; + ContinuousSample SSgetKeyLatency; + ContinuousSample SSgetKeyValuesLatency; + + ContinuousSample TSSgetValueLatency; + ContinuousSample TSSgetKeyLatency; + ContinuousSample TSSgetKeyValuesLatency; + + template + void recordLatency(const Req& req, double ssLatency, double tssLatency); + + void clear() { + SSgetValueLatency.clear(); + SSgetKeyLatency.clear(); + SSgetKeyValuesLatency.clear(); + + TSSgetValueLatency.clear(); + TSSgetKeyLatency.clear(); + TSSgetKeyValuesLatency.clear(); + } + + TSSMetrics() + : cc("TSSClientMetrics"), requests("Requests", cc), ssErrors("SSErrors", cc), tssErrors("TSSErrors", cc), + tssTimeouts("TSSTimeouts", cc), mismatches("Mismatches", cc), SSgetValueLatency(1000), SSgetKeyLatency(1000), + SSgetKeyValuesLatency(1000), TSSgetValueLatency(1000), TSSgetKeyLatency(1000), TSSgetKeyValuesLatency(1000) {} +}; + +// global static functions + +template +bool TSS_shouldDuplicateRequest(const Req& req); + +// part of the contract of this function is that if there is a mismatch, the implementation needs to record a trace +// event with the specified severity and tssId in the event. 
+template +bool TSS_doCompare(const Req& req, const Rep& src, const Rep& tss, Severity traceSeverity, UID tssId); + +#endif diff --git a/fdbrpc/fdbrpc.h b/fdbrpc/fdbrpc.h index a2a6af5af6..e15e0126a1 100644 --- a/fdbrpc/fdbrpc.h +++ b/fdbrpc/fdbrpc.h @@ -335,6 +335,7 @@ public: Future disc = makeDependent(IFailureMonitor::failureMonitor()).onDisconnectOrFailure(getEndpoint(taskID)); if (disc.isReady()) { + printf("got disconnect or failure 1 :O\n"); return ErrorOr(request_maybe_delivered()); } Reference peer = @@ -353,6 +354,7 @@ public: Future disc = makeDependent(IFailureMonitor::failureMonitor()).onDisconnectOrFailure(getEndpoint()); if (disc.isReady()) { + printf("got disconnect or failure 2 :O\n"); return ErrorOr(request_maybe_delivered()); } Reference peer = diff --git a/fdbrpc/simulator.h b/fdbrpc/simulator.h index 4b74ed91ba..fd49c64447 100644 --- a/fdbrpc/simulator.h +++ b/fdbrpc/simulator.h @@ -41,7 +41,7 @@ public: : desiredCoordinators(1), physicalDatacenters(1), processesPerMachine(0), listenersPerProcess(1), isStopped(false), lastConnectionFailure(0), connectionFailuresDisableDuration(0), speedUpSimulation(false), allSwapsDisabled(false), backupAgents(BackupAgentType::WaitForType), drAgents(BackupAgentType::WaitForType), - extraDB(nullptr), allowLogSetKills(true), usableRegions(1) {} + extraDB(nullptr), allowLogSetKills(true), usableRegions(1), tssMode(TSSMode::Disabled) {} // Order matters! enum KillType { @@ -55,6 +55,9 @@ public: None }; + // Order matters! 
all modes >= 2 are fault injection modes + enum TSSMode { Disabled, EnabledNormal, EnabledAddDelay, EnabledDropMutations }; + enum class BackupAgentType { NoBackupAgents, WaitForType, BackupToFile, BackupToDB }; // Subclasses may subclass ProcessInfo as well @@ -401,6 +404,7 @@ public: int32_t satelliteTLogWriteAntiQuorumFallback; std::vector>> primarySatelliteDcIds; std::vector>> remoteSatelliteDcIds; + TSSMode tssMode; // Used by workloads that perform reconfigurations int testerCount; diff --git a/fdbserver/ApplyMetadataMutation.cpp b/fdbserver/ApplyMetadataMutation.cpp index 125344d721..87044f49b7 100644 --- a/fdbserver/ApplyMetadataMutation.cpp +++ b/fdbserver/ApplyMetadataMutation.cpp @@ -95,12 +95,14 @@ void applyMetadataMutations(SpanID const& spanContext, for (const auto& id : src) { auto storageInfo = getStorageInfo(id, storageCache, txnStateStore); + ASSERT(!storageInfo->interf.isTss); ASSERT(storageInfo->tag != invalidTag); info.tags.push_back(storageInfo->tag); info.src_info.push_back(storageInfo); } for (const auto& id : dest) { auto storageInfo = getStorageInfo(id, storageCache, txnStateStore); + ASSERT(!storageInfo->interf.isTss); ASSERT(storageInfo->tag != invalidTag); info.tags.push_back(storageInfo->tag); info.dest_info.push_back(storageInfo); @@ -113,6 +115,11 @@ void applyMetadataMutations(SpanID const& spanContext, txnStateStore->set(KeyValueRef(m.param1, m.param2)); } else if (m.param1.startsWith(serverKeysPrefix)) { if (toCommit) { + Optional t = + txnStateStore->readValue(serverTagKeyFor(serverKeysDecodeServer(m.param1))).get(); + // printf("got SetValue for serverKeysPrefix/%s, tag=%s\n", + // serverKeysDecodeServer(m.param1).toString().c_str(), t.present() ? 
+ // decodeServerTagValue(t.get()).toString().c_str() : ""); MutationRef privatized = m; privatized.param1 = m.param1.withPrefix(systemKeys.begin, arena); TraceEvent(SevDebug, "SendingPrivateMutation", dbgid) diff --git a/fdbserver/ClusterController.actor.cpp b/fdbserver/ClusterController.actor.cpp index 53304bc6f6..abb87fdf2d 100644 --- a/fdbserver/ClusterController.actor.cpp +++ b/fdbserver/ClusterController.actor.cpp @@ -1,3 +1,4 @@ + /* * ClusterController.actor.cpp * @@ -3185,9 +3186,9 @@ ACTOR Future workerAvailabilityWatch(WorkerInterface worker, checkOutstandingRequests(cluster); } } + when(wait(failed)) { // remove workers that have failed WorkerInfo& failedWorkerInfo = cluster->id_worker[worker.locality.processId()]; - if (!failedWorkerInfo.reply.isSet()) { failedWorkerInfo.reply.send( RegisterWorkerReply(failedWorkerInfo.details.processClass, failedWorkerInfo.priorityInfo)); @@ -3378,14 +3379,22 @@ void clusterRegisterMaster(ClusterControllerData* self, RegisterMasterRequest co isChanged = true; } + // TODO remove debugging + printf("CC:\ntss_count=%d\ntss_storage_engine=%d|%s\n", + db->config.desiredTSSCount, + db->config.testingStorageServerStoreType, + db->config.testingStorageServerStoreType.toString().c_str()); + // Construct the client information if (db->clientInfo->get().commitProxies != req.commitProxies || db->clientInfo->get().grvProxies != req.grvProxies) { isChanged = true; + // TODO why construct a new one and not just copy the old one and change proxies + id? 
ClientDBInfo clientInfo; clientInfo.id = deterministicRandom()->randomUniqueID(); clientInfo.commitProxies = req.commitProxies; clientInfo.grvProxies = req.grvProxies; + clientInfo.tssMapping = db->clientInfo->get().tssMapping; db->clientInfo->set(clientInfo); dbInfo.client = db->clientInfo->get(); } @@ -3861,6 +3870,136 @@ ACTOR Future monitorServerInfoConfig(ClusterControllerData::DBInfo* db) { } } +// Monitors the tss mapping change key for changes, +// and broadcasts the new tss mapping to the rest of the cluster in ClientDBInfo. +ACTOR Future monitorTSSMapping(ClusterControllerData* self) { + state KeyBackedMap tssMapDB = KeyBackedMap(tssMappingKeys.begin); + loop { + state Reference tr = + Reference(new ReadYourWritesTransaction(self->db.db)); + loop { + try { + tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); + tr->setOption(FDBTransactionOptions::READ_LOCK_AWARE); + + std::vector> tssResults = + wait(tssMapDB.getRange(tr, UID(), Optional(), CLIENT_KNOBS->TOO_MANY)); + ASSERT(tssResults.size() < CLIENT_KNOBS->TOO_MANY); + + state std::unordered_map tssIdMap; + std::set seenTssIds; + + for (auto& it : tssResults) { + tssIdMap[it.first] = it.second; + // ensure two storage servers don't map to same TSS + ASSERT(seenTssIds.insert(it.second).second); + } + + // TODO REMOVE print + printf("tss mapping of size %d\n", tssIdMap.size()); + + // TODO is copying storage server interfaces bad? 
+ state std::vector> newMapping; + state std::map oldMapping; + state bool mappingChanged = false; + + state ClientDBInfo clientInfo = self->db.clientInfo->get(); + + for (auto& it : clientInfo.tssMapping) { + oldMapping[it.first] = it.second; + if (!tssIdMap.count(it.first)) { + // TODO add trace event + printf("tss mapping removed: %s=%s\n", + it.first.toString().c_str(), + it.second.id().toString().c_str()); + TraceEvent("TSS_MappingRemoved", self->id) + .detail("SSID", it.first) + .detail("TSSID", it.second.id()); + mappingChanged = true; + } + } + + for (auto& it : tssIdMap) { + bool ssAlreadyPaired = oldMapping.count(it.first); + + state Optional oldTssId; + state Optional oldGetValueEndpoint; + + if (ssAlreadyPaired) { + auto interf = oldMapping[it.first]; + // check if this SS maps to a new TSS + oldTssId = Optional(interf.id()); + oldGetValueEndpoint = Optional(interf.getValue.getEndpoint().token); + if (interf.id() != it.second) { + TraceEvent("TSS_MappingChanged", self->id) + .detail("SSID", it.first) + .detail("TSSID", it.second) + .detail("OldTSSID", interf.id()); + printf("tss mapping updated: %s=%s\n", + it.first.toString().c_str(), + it.second.toString().c_str()); + mappingChanged = true; + } + } else { + // TODO add trace event + TraceEvent("TSS_MappingAdded", self->id).detail("SSID", it.first).detail("TSSID", it.second); + printf("tss mapping added: %s=%s\n", it.first.toString().c_str(), it.second.toString().c_str()); + mappingChanged = true; + } + + state UID ssid = it.first; + state UID tssid = it.second; + // request storage server interface for tssid, add it to results + // TODO could issue all of these futures and then process then after as an optimization + Optional tssiVal = wait(tr->get(serverListKeyFor(it.second))); + + // because we read the tss mapping in the same transaction, there can be no races with tss removal + // and the tss interface must exist + ASSERT(tssiVal.present()); + + StorageServerInterface tssi = 
decodeServerListValue(tssiVal.get()); + if (oldTssId.present() && tssi.id() == oldTssId.get() && oldGetValueEndpoint.present() && + oldGetValueEndpoint.get() != tssi.getValue.getEndpoint().token) { + // TODO REMOVE print + printf("tss %s restarted, getValue %s -> %s\n", + tssi.id().toString().c_str(), + oldGetValueEndpoint.get().toString().c_str(), + tssi.getValue.getEndpoint().token.toString().c_str()); + mappingChanged = true; + } + newMapping.push_back(std::pair(ssid, tssi)); + } + + // if nothing changed, skip updating + if (mappingChanged) { + // TODO REMOVE print + printf("CC updating tss client and server info\n"); + clientInfo.id = deterministicRandom()->randomUniqueID(); + clientInfo.tssMapping = newMapping; + self->db.clientInfo->set(clientInfo); + + ServerDBInfo serverInfo = self->db.serverInfo->get(); + // also change server db info so workers get new mapping + serverInfo.id = deterministicRandom()->randomUniqueID(); + serverInfo.infoGeneration = ++self->db.dbInfoCount; + serverInfo.client = clientInfo; + self->db.serverInfo->set(serverInfo); + } + + state Future tssChangeFuture = tr->watch(tssMappingChangeKey); + + wait(tr->commit()); + wait(tssChangeFuture); + + break; + } catch (Error& e) { + wait(tr->onError(e)); + } + } + } +} + // Monitors the global configuration version key for changes. When changes are // made, the global configuration history is read and any updates are sent to // all processes in the system by updating the ClientDBInfo object. 
The @@ -4411,6 +4550,7 @@ ACTOR Future clusterControllerCore(ClusterControllerFullInterface interf, self.addActor.send(handleForcedRecoveries(&self, interf)); self.addActor.send(monitorDataDistributor(&self)); self.addActor.send(monitorRatekeeper(&self)); + self.addActor.send(monitorTSSMapping(&self)); self.addActor.send(dbInfoUpdater(&self)); self.addActor.send(traceCounters("ClusterControllerMetrics", self.id, @@ -4452,6 +4592,7 @@ ACTOR Future clusterControllerCore(ClusterControllerFullInterface interf, when(GetWorkersRequest req = waitNext(interf.getWorkers.getFuture())) { ++self.getWorkersRequests; vector workers; + // printf("CC got GetWorkersRequest\n"); for (auto& it : self.id_worker) { if ((req.flags & GetWorkersRequest::NON_EXCLUDED_PROCESSES_ONLY) && diff --git a/fdbserver/CommitProxyServer.actor.cpp b/fdbserver/CommitProxyServer.actor.cpp index d1469c0d3b..3fc1ed02c3 100644 --- a/fdbserver/CommitProxyServer.actor.cpp +++ b/fdbserver/CommitProxyServer.actor.cpp @@ -1507,6 +1507,7 @@ ACTOR static Future rejoinServer(CommitProxyInterface proxy, ProxyCommitDa loop { GetStorageServerRejoinInfoRequest req = waitNext(proxy.getStorageServerRejoinInfo.getFuture()); + printf("Proxy got Rejoin req for %s\n", req.id.toString().c_str()); if (commitData->txnStateStore->readValue(serverListKeyFor(req.id)).get().present()) { GetStorageServerRejoinInfoReply rep; rep.version = commitData->version; @@ -1567,8 +1568,10 @@ ACTOR static Future rejoinServer(CommitProxyInterface proxy, ProxyCommitDa } rep.newTag = Tag(maxTagLocality + 1, 0); } + printf("Proxy sent Rejoin response for %s\n", req.id.toString().c_str()); req.reply.send(rep); } else { + printf("Proxy notifying %s it can't rejoin because it was removed.\n", req.id.toString().c_str()); req.reply.sendError(worker_removed()); } } diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index d1762fc7cb..cbb0364178 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ 
b/fdbserver/DataDistribution.actor.cpp @@ -66,6 +66,7 @@ struct TCServerInfo : public ReferenceCounted { Future> onInterfaceChanged; Promise removed; Future onRemoved; + Future onTSSPairRemoved; Promise wakeUpTracker; bool inDesiredDC; LocalityEntry localityEntry; @@ -83,8 +84,10 @@ struct TCServerInfo : public ReferenceCounted { Reference storageServerSet) : id(ssi.id()), collection(collection), lastKnownInterface(ssi), lastKnownClass(processClass), dataInFlightToServer(0), onInterfaceChanged(interfaceChanged.getFuture()), onRemoved(removed.getFuture()), - inDesiredDC(inDesiredDC), storeType(KeyValueStoreType::END) { - localityEntry = ((LocalityMap*)storageServerSet.getPtr())->add(ssi.locality, &id); + inDesiredDC(inDesiredDC), storeType(KeyValueStoreType::END), onTSSPairRemoved(Never()) { + if (!ssi.isTss) { + localityEntry = ((LocalityMap*)storageServerSet.getPtr())->add(ssi.locality, &id); + } } bool isCorrectStoreType(KeyValueStoreType configStoreType) { @@ -398,6 +401,7 @@ ACTOR Future> getInitialDataDistribution(Data state std::map> server_dc; state std::map, std::pair, vector>> team_cache; + state std::vector> tss_servers; // Get the server list in its own try/catch block since it modifies result. 
We don't want a subsequent failure // causing entries to be duplicated @@ -447,12 +451,19 @@ ACTOR Future> getInitialDataDistribution(Data for (int i = 0; i < serverList.get().size(); i++) { auto ssi = decodeServerListValue(serverList.get()[i].value); - result->allServers.push_back(std::make_pair(ssi, id_data[ssi.locality.processId()].processClass)); - server_dc[ssi.id()] = ssi.locality.dcId(); + if (!ssi.isTss) { + printf("DD adding SS %s on init\n", ssi.id().toString().c_str()); + result->allServers.push_back(std::make_pair(ssi, id_data[ssi.locality.processId()].processClass)); + server_dc[ssi.id()] = ssi.locality.dcId(); + } else { + printf("DD ignoring TSS %s on init until after team building\n", ssi.id().toString().c_str()); + tss_servers.push_back(std::make_pair(ssi, id_data[ssi.locality.processId()].processClass)); + } } break; } catch (Error& e) { + printf("get initial DD failed %d\n", e.code()); wait(tr.onError(e)); ASSERT(!succeeded); // We shouldn't be retrying if we have already started modifying result in this loop @@ -546,6 +557,7 @@ ACTOR Future> getInitialDataDistribution(Data beginKey = keyServers.end()[-1].key; break; } catch (Error& e) { + printf("GetInitialTeams got error %d\n", e.code()); TraceEvent("GetInitialTeamsKeyServersRetry", distributorId).error(e); wait(tr.onError(e)); @@ -559,6 +571,12 @@ ACTOR Future> getInitialDataDistribution(Data // a dummy shard at the end with no keys or servers makes life easier for trackInitialShards() result->shards.push_back(DDShardInfo(allKeys.end)); + // add tss to server list AFTER teams are built + for (auto& it : tss_servers) { + printf("DD adding TSS %s on init\n", it.first.id().toString().c_str()); + result->allServers.push_back(it); + } + return result; } @@ -567,7 +585,8 @@ ACTOR Future storageServerTracker(struct DDTeamCollection* self, TCServerInfo* server, Promise errorOut, Version addedVersion, - const DDEnabledState* ddEnabledState); + const DDEnabledState* ddEnabledState, + bool isTss); Future 
teamTracker(struct DDTeamCollection* const& self, Reference const& team, @@ -598,6 +617,8 @@ struct DDTeamCollection : ReferenceCounted { int64_t unhealthyServers; std::map priority_teams; std::map> server_info; + std::map> tss_info_by_pair; + std::map> server_and_tss_info; // TODO could replace this with an efficient way to do a read-only concatenation of 2 data structures? std::map lagging_zones; // zone to number of storage servers lagging AsyncVar disableFailingLaggingServers; @@ -610,6 +631,7 @@ struct DDTeamCollection : ReferenceCounted { vector> badTeams; Reference shardsAffectedByTeamFailure; PromiseStream removedServers; + PromiseStream removedTSS; std::set recruitingIds; // The IDs of the SS which are being recruited std::set recruitingLocalities; Future initialFailureReactionDelay; @@ -624,6 +646,8 @@ struct DDTeamCollection : ReferenceCounted { int optimalTeamCount; AsyncVar zeroOptimalTeams; + bool isTssRecruiting; // If tss recruiting is waiting on a pair, don't consider DD recruiting for the purposes of QuietDB + // EXCLUDED if an address is in the excluded list in the database. // FAILED if an address is permanently failed. // NONE by default. 
Updated asynchronously (eventually) @@ -709,7 +733,7 @@ struct DDTeamCollection : ReferenceCounted { initializationDoneActor(logOnCompletion(readyToStart && initialFailureReactionDelay, this)), optimalTeamCount(0), recruitingStream(0), restartRecruiting(SERVER_KNOBS->DEBOUNCE_RECRUITING_DELAY), unhealthyServers(0), includedDCs(includedDCs), otherTrackedDCs(otherTrackedDCs), - zeroHealthyTeams(zeroHealthyTeams), zeroOptimalTeams(true), primary(primary), + zeroHealthyTeams(zeroHealthyTeams), zeroOptimalTeams(true), primary(primary), isTssRecruiting(false), medianAvailableSpace(SERVER_KNOBS->MIN_AVAILABLE_SPACE_RATIO), lastMedianAvailableSpaceUpdate(0), processingUnhealthy(processingUnhealthy), lowestUtilizationTeam(0), highestUtilizationTeam(0), getShardMetrics(getShardMetrics), removeFailedServer(removeFailedServer) { @@ -758,10 +782,11 @@ struct DDTeamCollection : ReferenceCounted { // The following makes sure that, even if a reference to a team is held in the DD Queue, the tracker will be // stopped // before the server_status map to which it has a pointer, is destroyed. - for (auto& [_, info] : server_info) { + for (auto& [_, info] : server_and_tss_info) { info->tracker.cancel(); info->collection = nullptr; } + // TraceEvent("DDTeamCollectionDestructed", distributorId) // .detail("Primary", primary) // .detail("ServerTrackerDestroyed", server_info.size()); @@ -1128,6 +1153,7 @@ struct DDTeamCollection : ReferenceCounted { self->healthyZone.set(initTeams->initHealthyZoneValue); // SOMEDAY: If some servers have teams and not others (or some servers have more data than others) and there is // an address/locality collision, should we preferentially mark the least used server as undesirable? 
+ for (auto i = initTeams->allServers.begin(); i != initTeams->allServers.end(); ++i) { if (self->shouldHandleServer(i->first)) { if (!self->isValidLocality(self->configuration.storagePolicy, i->first.locality)) { @@ -1141,6 +1167,7 @@ struct DDTeamCollection : ReferenceCounted { self->addActor.send(self->checkInvalidLocalities); } } + printf("%p init adding %s\n", (void*)self, i->first.toString().c_str()); self->addServer(i->first, i->second, self->serverTrackerErrorOut, 0, ddEnabledState); } } @@ -2419,14 +2446,25 @@ struct DDTeamCollection : ReferenceCounted { if (!shouldHandleServer(newServer)) { return; } - allServers.push_back(newServer.id()); - TraceEvent("AddedStorageServer", distributorId) + // printf("addServer(%s)\n", newServer.id().toString().c_str()); + + if (!newServer.isTss) { + allServers.push_back(newServer.id()); + } + + TraceEvent(newServer.isTss ? "AddedTSS" : "AddedStorageServer", distributorId) .detail("ServerID", newServer.id()) .detail("ProcessClass", processClass.toString()) .detail("WaitFailureToken", newServer.waitFailure.getEndpoint().token) .detail("Address", newServer.waitFailure.getEndpoint().getPrimaryAddress()); - auto& r = server_info[newServer.id()] = makeReference( + + // TODO how to do this? 
+ /*if (newServer.isTss) { + tr.detail("TSSPairID", newServer.tssPairID); + }*/ + + auto& r = server_and_tss_info[newServer.id()] = makeReference( newServer, this, processClass, @@ -2434,12 +2472,33 @@ struct DDTeamCollection : ReferenceCounted { std::find(includedDCs.begin(), includedDCs.end(), newServer.locality.dcId()) != includedDCs.end(), storageServerSet); - // Establish the relation between server and machine - checkAndCreateMachine(r); + if (newServer.isTss) { + tss_info_by_pair[newServer.tssPairID] = r; - r->tracker = storageServerTracker(this, cx, r.getPtr(), errorOut, addedVersion, ddEnabledState); - doBuildTeams = true; // Adding a new server triggers to build new teams - restartTeamBuilder.trigger(); + if (server_info.count(newServer.tssPairID)) { + r->onTSSPairRemoved = server_info[newServer.tssPairID]->onRemoved; + } + } else { + server_info[newServer.id()] = r; + // Establish the relation between server and machine + checkAndCreateMachine(r); + } + + r->tracker = + storageServerTracker(this, cx, r.getPtr(), errorOut, addedVersion, ddEnabledState, newServer.isTss); + + if (!newServer.isTss) { + // link and wake up tss' tracker so it knows when this server gets removed + if (tss_info_by_pair.count(newServer.id())) { + tss_info_by_pair[newServer.id()]->onTSSPairRemoved = r->onRemoved; + if (tss_info_by_pair[newServer.id()]->wakeUpTracker.canBeSet()) { + tss_info_by_pair[newServer.id()]->wakeUpTracker.send(Void()); + } + } + + doBuildTeams = true; // Adding a new server triggers to build new teams + restartTeamBuilder.trigger(); + } } bool removeTeam(Reference team) { @@ -2605,7 +2664,21 @@ struct DDTeamCollection : ReferenceCounted { return foundMachineTeam; } + void removeTSS(UID removedServer) { + // much simpler than remove server. 
tss isn't in any teams, so just remove it from data structures + TEST(true); // Remove a TSS frm the cluster + printf("Removing tss %s\n", removedServer.toString().c_str()); + TraceEvent("RemovedTSS", distributorId).detail("ServerID", removedServer); + Reference removedServerInfo = server_and_tss_info[removedServer]; + + tss_info_by_pair.erase(removedServerInfo->lastKnownInterface.tssPairID); + server_and_tss_info.erase(removedServer); + + server_status.clear(removedServer); + } + void removeServer(UID removedServer) { + printf("Removing ss %s\n", removedServer.toString().c_str()); TraceEvent("RemovedStorageServer", distributorId).detail("ServerID", removedServer); // ASSERT( !shardsAffectedByTeamFailure->getServersForTeam( t ) for all t in teams that contain removedServer ) @@ -2703,6 +2776,7 @@ struct DDTeamCollection : ReferenceCounted { } } server_info.erase(removedServer); + server_and_tss_info.erase(removedServer); if (server_status.get(removedServer).initialized && server_status.get(removedServer).isUnhealthy()) { unhealthyServers--; @@ -2726,7 +2800,7 @@ struct DDTeamCollection : ReferenceCounted { }; TCServerInfo::~TCServerInfo() { - if (collection && ssVersionTooFarBehind.get()) { + if (collection && ssVersionTooFarBehind.get() && !lastKnownInterface.isTss) { collection->removeLaggingStorageServer(lastKnownInterface.locality.zoneId().get()); } } @@ -3359,6 +3433,7 @@ ACTOR Future teamTracker(DDTeamCollection* self, Reference tea .detail("IsReady", self->initialFailureReactionDelay.isReady()); self->traceTeamCollectionInfo(); } + // Check if the number of degraded machines has changed state vector> change; bool anyUndesired = false; @@ -3400,18 +3475,20 @@ ACTOR Future teamTracker(DDTeamCollection* self, Reference tea bool containsFailed = teamContainsFailedServer(self, team); bool recheck = !healthy && (lastReady != self->initialFailureReactionDelay.isReady() || (lastZeroHealthy && !self->zeroHealthyTeams->get()) || containsFailed); - // 
TraceEvent("TeamHealthChangeDetected", self->distributorId) - // .detail("Team", team->getDesc()) - // .detail("ServersLeft", serversLeft) - // .detail("LastServersLeft", lastServersLeft) - // .detail("AnyUndesired", anyUndesired) - // .detail("LastAnyUndesired", lastAnyUndesired) - // .detail("AnyWrongConfiguration", anyWrongConfiguration) - // .detail("LastWrongConfiguration", lastWrongConfiguration) - // .detail("Recheck", recheck) - // .detail("BadTeam", badTeam) - // .detail("LastZeroHealthy", lastZeroHealthy) - // .detail("ZeroHealthyTeam", self->zeroHealthyTeams->get()); + + // TODO recomment + TraceEvent("TeamHealthChangeDetected", self->distributorId) + .detail("Team", team->getDesc()) + .detail("ServersLeft", serversLeft) + .detail("LastServersLeft", lastServersLeft) + .detail("AnyUndesired", anyUndesired) + .detail("LastAnyUndesired", lastAnyUndesired) + .detail("AnyWrongConfiguration", anyWrongConfiguration) + .detail("LastWrongConfiguration", lastWrongConfiguration) + .detail("Recheck", recheck) + .detail("BadTeam", badTeam) + .detail("LastZeroHealthy", lastZeroHealthy) + .detail("ZeroHealthyTeam", self->zeroHealthyTeams->get()); lastReady = self->initialFailureReactionDelay.isReady(); lastZeroHealthy = self->zeroHealthyTeams->get(); @@ -3764,8 +3841,8 @@ ACTOR Future waitServerListChange(DDTeamCollection* self, ProcessClass const& processClass = results[i].second; if (!self->shouldHandleServer(ssi)) { continue; - } else if (self->server_info.count(serverId)) { - auto& serverInfo = self->server_info[serverId]; + } else if (self->server_and_tss_info.count(serverId)) { + auto& serverInfo = self->server_and_tss_info[serverId]; if (ssi.getValue.getEndpoint() != serverInfo->lastKnownInterface.getValue.getEndpoint() || processClass != serverInfo->lastKnownClass.classType()) { Promise> currentInterfaceChanged = @@ -3783,7 +3860,9 @@ ACTOR Future waitServerListChange(DDTeamCollection* self, self->serverTrackerErrorOut, tr.getReadVersion().get(), 
ddEnabledState); - self->doBuildTeams = true; + if (!ssi.isTss) { + self->doBuildTeams = true; + } } } @@ -3798,6 +3877,7 @@ ACTOR Future waitServerListChange(DDTeamCollection* self, } } } catch (Error& e) { + printf("WaitServerListChange got error %d\n", e.code()); wait(tr.onError(e)); serverListAndProcessClasses = Never(); isFetchingResults = false; @@ -3886,16 +3966,18 @@ ACTOR Future keyValueStoreTypeTracker(DDTeamCollection* self, TCServerInfo } ACTOR Future waitForAllDataRemoved(Database cx, UID serverID, Version addedVersion, DDTeamCollection* teams) { - state Transaction tr(cx); + state Reference tr = makeReference(cx); + printf("Waiting for data to be removed from %s\n", serverID.toString().c_str()); loop { try { - tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); - Version ver = wait(tr.getReadVersion()); + tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + Version ver = wait(tr->getReadVersion()); // we cannot remove a server immediately after adding it, because a perfectly timed master recovery could // cause us to not store the mutations sent to the short lived storage server. 
if (ver > addedVersion + SERVER_KNOBS->MAX_READ_TRANSACTION_LIFE_VERSIONS) { - bool canRemove = wait(canRemoveStorageServer(&tr, serverID)); + bool canRemove = wait(canRemoveStorageServer(tr, serverID)); // TraceEvent("WaitForAllDataRemoved") // .detail("Server", serverID) // .detail("CanRemove", canRemove) @@ -3908,9 +3990,9 @@ ACTOR Future waitForAllDataRemoved(Database cx, UID serverID, Version adde // Wait for any change to the serverKeys for this server wait(delay(SERVER_KNOBS->ALL_DATA_REMOVED_DELAY, TaskPriority::DataDistribution)); - tr.reset(); + tr->reset(); } catch (Error& e) { - wait(tr.onError(e)); + wait(tr->onError(e)); } } } @@ -3923,6 +4005,10 @@ ACTOR Future storageServerFailureTracker(DDTeamCollection* self, state StorageServerInterface interf = server->lastKnownInterface; state int targetTeamNumPerServer = (SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER * (self->configuration.storageTeamSize + 1)) / 2; + + printf("Starting failure tracker for %sSS %s\n", + server->lastKnownInterface.isTss ? 
"T" : "", + server->lastKnownInterface.id().toString().c_str()); loop { state bool inHealthyZone = false; // healthChanged actor will be Never() if this flag is true if (self->healthyZone.get().present()) { @@ -3941,16 +4027,18 @@ ACTOR Future storageServerFailureTracker(DDTeamCollection* self, } } - if (self->server_status.get(interf.id()).initialized) { - bool unhealthy = self->server_status.get(interf.id()).isUnhealthy(); - if (unhealthy && !status->isUnhealthy()) { - self->unhealthyServers--; - } - if (!unhealthy && status->isUnhealthy()) { + if (!interf.isTss) { + if (self->server_status.get(interf.id()).initialized) { + bool unhealthy = self->server_status.get(interf.id()).isUnhealthy(); + if (unhealthy && !status->isUnhealthy()) { + self->unhealthyServers--; + } + if (!unhealthy && status->isUnhealthy()) { + self->unhealthyServers++; + } + } else if (status->isUnhealthy()) { self->unhealthyServers++; } - } else if (status->isUnhealthy()) { - self->unhealthyServers++; } self->server_status.set(interf.id(), *status); @@ -3971,7 +4059,7 @@ ACTOR Future storageServerFailureTracker(DDTeamCollection* self, choose { when(wait(healthChanged)) { status->isFailed = !status->isFailed; - if (!status->isFailed && + if (!status->isFailed && !server->lastKnownInterface.isTss && (server->teams.size() < targetTeamNumPerServer || self->lastBuildTeamsFailed)) { self->doBuildTeams = true; } @@ -4014,7 +4102,9 @@ ACTOR Future storageServerTracker( TCServerInfo* server, // This actor is owned by this TCServerInfo, point to server_info[id] Promise errorOut, Version addedVersion, - const DDEnabledState* ddEnabledState) { + const DDEnabledState* ddEnabledState, + bool isTss) { + state Future failureTracker; state ServerStatus status(false, false, server->lastKnownInterface.locality); state bool lastIsUnhealthy = false; @@ -4022,13 +4112,16 @@ ACTOR Future storageServerTracker( state Future> interfaceChanged = server->onInterfaceChanged; - state Future storeTypeTracker = 
keyValueStoreTypeTracker(self, server); + state Future storeTypeTracker = (isTss) ? Never() : keyValueStoreTypeTracker(self, server); state bool hasWrongDC = !isCorrectDC(self, server); state bool hasInvalidLocality = !self->isValidLocality(self->configuration.storagePolicy, server->lastKnownInterface.locality); state int targetTeamNumPerServer = (SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER * (self->configuration.storageTeamSize + 1)) / 2; + // TODO REMOVE + printf("Started %sSS tracker for %s\n", isTss ? "T" : "", server->id.toString().c_str()); + try { loop { status.isUndesired = !self->disableFailingLaggingServers.get() && server->ssVersionTooFarBehind.get(); @@ -4042,7 +4135,7 @@ ACTOR Future storageServerTracker( // dcLocation, interface) is changed. state std::vector> otherChanges; std::vector> wakeUpTrackers; - for (const auto& i : self->server_info) { + for (const auto& i : self->server_and_tss_info) { if (i.second.getPtr() != server && i.second->lastKnownInterface.address() == server->lastKnownInterface.address()) { auto& statusInfo = self->server_status.get(i.first); @@ -4144,11 +4237,11 @@ ACTOR Future storageServerTracker( .detail("Excluded", worstAddr.toString()); status.isUndesired = true; status.isWrongConfiguration = true; - if (worstStatus == DDTeamCollection::Status::FAILED) { + if (worstStatus == DDTeamCollection::Status::FAILED && !isTss) { TraceEvent(SevWarn, "FailedServerRemoveKeys", self->distributorId) .detail("Server", server->id) .detail("Excluded", worstAddr.toString()); - wait(delay(0.0)); //Do not throw an error while still inside trackExcludedServers + wait(delay(0.0)); // Do not throw an error while still inside trackExcludedServers while (!ddEnabledState->isDDEnabled()) { wait(delay(1.0)); } @@ -4165,7 +4258,7 @@ ACTOR Future storageServerTracker( self->restartRecruiting.trigger(); } - if (lastIsUnhealthy && !status.isUnhealthy() && + if (lastIsUnhealthy && !status.isUnhealthy() && !isTss && (server->teams.size() < targetTeamNumPerServer 
|| self->lastBuildTeamsFailed)) { self->doBuildTeams = true; self->restartTeamBuilder.trigger(); // This does not trigger building teams if there exist healthy teams @@ -4174,7 +4267,9 @@ ACTOR Future storageServerTracker( state bool recordTeamCollectionInfo = false; choose { - when(wait(failureTracker)) { + when(wait(failureTracker || server->onTSSPairRemoved)) { + printf("Server %s getting removed\n", server->id.toString().c_str()); + // The server is failed AND all data has been removed from it, so permanently remove it. TraceEvent("StatusMapChange", self->distributorId) .detail("ServerID", server->id) @@ -4185,7 +4280,9 @@ ACTOR Future storageServerTracker( } // Remove server from FF/serverList - wait(removeStorageServer(cx, server->id, self->lock, ddEnabledState)); + Optional tssPairID = + server->lastKnownInterface.isTss ? server->lastKnownInterface.tssPairID : Optional(); + wait(removeStorageServer(cx, server->id, tssPairID, self->lock, ddEnabledState)); TraceEvent("StatusMapChange", self->distributorId) .detail("ServerID", server->id) @@ -4193,7 +4290,11 @@ ACTOR Future storageServerTracker( // Sets removeSignal (alerting dataDistributionTeamCollection to remove the storage server from its // own data structures) server->removed.send(Void()); - self->removedServers.send(server->id); + if (isTss) { + self->removedTSS.send(server->id); + } else { + self->removedServers.send(server->id); + } return Void(); } when(std::pair newInterface = wait(interfaceChanged)) { @@ -4211,7 +4312,7 @@ ACTOR Future storageServerTracker( server->lastKnownInterface = newInterface.first; server->lastKnownClass = newInterface.second; - if (localityChanged) { + if (localityChanged && !isTss) { TEST(true); // Server locality changed // The locality change of a server will affect machine teams related to the server if @@ -4303,7 +4404,7 @@ ACTOR Future storageServerTracker( recordTeamCollectionInfo = true; // Restart the storeTracker for the new interface. 
This will cancel the previous // keyValueStoreTypeTracker - storeTypeTracker = keyValueStoreTypeTracker(self, server); + storeTypeTracker = (isTss) ? Never() : keyValueStoreTypeTracker(self, server); hasWrongDC = !isCorrectDC(self, server); hasInvalidLocality = !self->isValidLocality(self->configuration.storagePolicy, server->lastKnownInterface.locality); @@ -4350,6 +4451,7 @@ ACTOR Future storageServerTracker( // Monitor whether or not storage servers are being recruited. If so, then a database cannot be considered quiet ACTOR Future monitorStorageServerRecruitment(DDTeamCollection* self) { state bool recruiting = false; + state bool lastIsTss = false; TraceEvent("StorageServerRecruitment", self->distributorId) .detail("State", "Idle") .trackLatest("StorageServerRecruitment_" + self->distributorId.toString()); @@ -4360,12 +4462,22 @@ ACTOR Future monitorStorageServerRecruitment(DDTeamCollection* self) { } TraceEvent("StorageServerRecruitment", self->distributorId) .detail("State", "Recruiting") + .detail("IsTSS", self->isTssRecruiting ? "True" : "False") .trackLatest("StorageServerRecruitment_" + self->distributorId.toString()); recruiting = true; + lastIsTss = self->isTssRecruiting; } else { loop { choose { - when(wait(self->recruitingStream.onChange())) {} + when(wait(self->recruitingStream.onChange())) { + if (lastIsTss != self->isTssRecruiting) { + TraceEvent("StorageServerRecruitment", self->distributorId) + .detail("State", "Recruiting") + .detail("IsTSS", self->isTssRecruiting ? "True" : "False") + .trackLatest("StorageServerRecruitment_" + self->distributorId.toString()); + lastIsTss = self->isTssRecruiting; + } + } when(wait(self->recruitingStream.get() == 0 ? delay(SERVER_KNOBS->RECRUITMENT_IDLE_DELAY, TaskPriority::DataDistribution) : Future(Never()))) { @@ -4444,8 +4556,9 @@ ACTOR Future checkAndRemoveInvalidLocalityAddr(DDTeamCollection* self) { } int numExistingSSOnAddr(DDTeamCollection* self, const AddressExclusion& addr) { + // TODO add tss? 
int numExistingSS = 0; - for (auto& server : self->server_info) { + for (auto& server : self->server_and_tss_info) { const NetworkAddress& netAddr = server.second->lastKnownInterface.stableAddress(); AddressExclusion usedAddr(netAddr.ip, netAddr.port); if (usedAddr == addr) { @@ -4456,9 +4569,75 @@ int numExistingSSOnAddr(DDTeamCollection* self, const AddressExclusion& addr) { return numExistingSS; } +// All state that represents an ongoing tss pair recruitment +struct TSSRecruitmentState : ReferenceCounted, NonCopyable { + Promise>> + ssPairInfo; // if set, for ss to pass its id to tss pair once it is successfully recruited + Promise tssPairDone; // if set, for tss to pass ss that it was successfully recruited + Optional dcId; // dc + bool active; + + TSSRecruitmentState() : active(false) {} + + TSSRecruitmentState(Optional dcId) : active(true), dcId(dcId) {} + + void cancel() { + // only cancel if both haven't been set, otherwise one half of pair could think it was successful but the other + // half would think it failed + if (active && ssPairInfo.canBeSet() && tssPairDone.canBeSet()) { + ssPairInfo.send(Optional>()); + // callback of ssPairInfo could have cancelled tssPairDone already, so double check before cancelling + if (tssPairDone.canBeSet()) { + tssPairDone.send(false); + } + } + } + + bool tssRecruitSuccess() { + if (active && tssPairDone.canBeSet()) { + tssPairDone.send(true); + return true; + } + return false; + } + + bool tssRecruitFailed() { + if (active && tssPairDone.canBeSet()) { + printf("tssPair: %p\n", &tssPairDone); + tssPairDone.send(false); + return true; + } + return false; + } + + bool ssRecruitSuccess(std::pair ssInfo) { + if (active && ssPairInfo.canBeSet()) { + ssPairInfo.send(Optional>(ssInfo)); + return true; + } + return false; + } + + bool ssRecruitFailed() { + if (active && ssPairInfo.canBeSet()) { + ssPairInfo.send(Optional>()); + return true; + } + return false; + } + + Future>> waitOnSS() { return ssPairInfo.getFuture(); } + + 
Future waitOnTSS() { return tssPairDone.getFuture(); } +}; + +// TODO switch recruitment order(ish) - grab tss but don't init it, wait for it to actually grab an ss, then the ss +// signals here to start, then when done this signals the ss to add server ACTOR Future initializeStorage(DDTeamCollection* self, RecruitStorageReply candidateWorker, - const DDEnabledState* ddEnabledState) { + const DDEnabledState* ddEnabledState, + bool recruitTss, + Reference tssState) { // SOMEDAY: Cluster controller waits for availability, retry quickly if a server's Locality changes self->recruitingStream.set(self->recruitingStream.get() + 1); @@ -4470,11 +4649,61 @@ ACTOR Future initializeStorage(DDTeamCollection* self, // too many storage server on the same address (i.e., process) can cause OOM. // Ask the candidateWorker to initialize a SS only if the worker does not have a pending request state UID interfaceId = deterministicRandom()->randomUniqueID(); - InitializeStorageRequest isr; - isr.storeType = self->configuration.storageServerStoreType; + + state InitializeStorageRequest isr; + isr.storeType = + recruitTss ? self->configuration.testingStorageServerStoreType : self->configuration.storageServerStoreType; isr.seedTag = invalidTag; isr.reqId = deterministicRandom()->randomUniqueID(); isr.interfaceId = interfaceId; + isr.isTss = recruitTss; + + printf("InitStorage %s on %sSS %s\n", + interfaceId.toString().c_str(), + recruitTss ? "T" : "", + candidateWorker.worker.address().toString().c_str()); + + self->recruitingIds.insert(interfaceId); + self->recruitingLocalities.insert(candidateWorker.worker.stableAddress()); + + // if tss, wait for pair ss to finish and add its id to isr. 
If pair fails, don't recruit tss + state bool doRecruit = true; + if (recruitTss) { + TraceEvent("TSS_Recruit", self->distributorId) + .detail("TSSID", interfaceId) + .detail("Stage", "TSSWaitingPair") + .detail("Addr", candidateWorker.worker.address()) + .detail("Locality", candidateWorker.worker.locality.toString()); + + printf("TSS %s waiting for partner uid\n", interfaceId.toString().c_str()); + Optional> ssPairInfoResult = wait(tssState->waitOnSS()); + if (ssPairInfoResult.present()) { + printf("TSS %s got pair of %s @ %lld\n", + interfaceId.toString().c_str(), + ssPairInfoResult.get().first.toString().c_str(), + ssPairInfoResult.get().second); + isr.tssPairID = ssPairInfoResult.get().first; + isr.tssPairVersion = ssPairInfoResult.get().second; + + TraceEvent("TSS_Recruit", self->distributorId) + .detail("SSID", isr.tssPairID) + .detail("TSSID", interfaceId) + .detail("Stage", "TSSWaitingPair") + .detail("Addr", candidateWorker.worker.address()) + .detail("Locality", candidateWorker.worker.locality.toString()); + } else { + printf("TSS %s didn't get partner, partner recruitment must have failed, abandoning\n", + interfaceId.toString().c_str()); + isr.isTss = false; + doRecruit = false; + + TraceEvent(SevWarn, "TSS_RecruitError", self->distributorId) + .detail("TSSID", interfaceId) + .detail("Reason", "SS recruitment failed for some reason") + .detail("Addr", candidateWorker.worker.address()) + .detail("Locality", candidateWorker.worker.locality.toString()); + } + } TraceEvent("DDRecruiting") .detail("Primary", self->primary) @@ -4483,19 +4712,64 @@ ACTOR Future initializeStorage(DDTeamCollection* self, .detail("WorkerLocality", candidateWorker.worker.locality.toString()) .detail("Interf", interfaceId) .detail("Addr", candidateWorker.worker.address()) + .detail("TSS", recruitTss ? 
"true" : "false") .detail("RecruitingStream", self->recruitingStream.get()); - self->recruitingIds.insert(interfaceId); - self->recruitingLocalities.insert(candidateWorker.worker.stableAddress()); - state ErrorOr newServer = - wait(candidateWorker.worker.storage.tryGetReply(isr, TaskPriority::DataDistribution)); - if (newServer.isError()) { + Future> fRecruit = + doRecruit ? candidateWorker.worker.storage.tryGetReply(isr, TaskPriority::DataDistribution) + : Future>(ErrorOr(recruitment_failed())); + + state ErrorOr newServer = wait(fRecruit); + + if (doRecruit && newServer.isError()) { TraceEvent(SevWarn, "DDRecruitmentError").error(newServer.getError()); if (!newServer.isError(error_code_recruitment_failed) && !newServer.isError(error_code_request_maybe_delivered)) throw newServer.getError(); wait(delay(SERVER_KNOBS->STORAGE_RECRUITMENT_DELAY, TaskPriority::DataDistribution)); } + + if (!recruitTss && newServer.present() && + tssState->ssRecruitSuccess(std::pair(interfaceId, newServer.get().addedVersion))) { + printf("ss %s signalling tss pair with version %lld\n", + interfaceId.toString().c_str(), + newServer.get().addedVersion); + // ss has a tss pair. send it this id, but wait for add server until tss is recruited + + TraceEvent("TSS_Recruit", self->distributorId) + .detail("SSID", interfaceId) + .detail("Stage", "SSSignaling") + .detail("Addr", candidateWorker.worker.address()) + .detail("Locality", candidateWorker.worker.locality.toString()); + + // wait for timeout, and give up if no TSS pair recruited + Optional tssSuccessful = wait(timeout(tssState->waitOnTSS(), SERVER_KNOBS->TSS_RECRUITMENT_TIMEOUT)); + + // TODO if unsuccessful, fail out tss so it doesn't cause a mismatch error? 
+ if (tssSuccessful.present() && tssSuccessful.get()) { + TraceEvent("TSS_Recruit", self->distributorId) + .detail("SSID", interfaceId) + .detail("Stage", "SSGotPair") + .detail("Addr", candidateWorker.worker.address()) + .detail("Locality", candidateWorker.worker.locality.toString()); + } else { + TraceEvent(SevWarn, "TSS_RecruitError", self->distributorId) + .detail("SSID", interfaceId) + .detail("Reason", + tssSuccessful.present() ? "TSS recruitment failed for some reason" + : "TSS recruitment timed out") + .detail("Addr", candidateWorker.worker.address()) + .detail("Locality", candidateWorker.worker.locality.toString()); + + // TODO need to remove that tss here!! + } + + // TODO trace event, change sev and message if timeout or if unsuccessful + printf("ss %s %ssuccessfully got tss pair!\n", + interfaceId.toString().c_str(), + (tssSuccessful.present() && tssSuccessful.get()) ? "" : "un"); + } + self->recruitingIds.erase(interfaceId); self->recruitingLocalities.erase(candidateWorker.worker.stableAddress()); @@ -4509,26 +4783,46 @@ ACTOR Future initializeStorage(DDTeamCollection* self, .detail("RecruitingStream", self->recruitingStream.get()); if (newServer.present()) { - if (!self->server_info.count(newServer.get().interf.id())) - self->addServer(newServer.get().interf, - candidateWorker.processClass, - self->serverTrackerErrorOut, - newServer.get().addedVersion, - ddEnabledState); - else - TraceEvent(SevWarn, "DDRecruitmentError").detail("Reason", "Server ID already recruited"); - - self->doBuildTeams = true; + UID id = newServer.get().interf.id(); + if (!self->server_and_tss_info.count(id)) { + if (!recruitTss || tssState->tssRecruitSuccess()) { + self->addServer(newServer.get().interf, + candidateWorker.processClass, + self->serverTrackerErrorOut, + newServer.get().addedVersion, + ddEnabledState); + } else { + // TODO tss recruitment was cancelled since it failed to send a response to the ss, kill it + printf("TSS recruitment was cancelled, stop\n"); + } + } 
else { + TraceEvent(SevWarn, "DDRecruitmentError") + .detail("Reason", "Server ID already recruited") + .detail("ServerID", id); + } + if (!recruitTss) { + self->doBuildTeams = true; + } } } + if (recruitTss && tssState->tssRecruitFailed()) { + TEST(true); // TSS recruitment failed for some reason + // if tss wasn't already marked as done, it was unsuccessful in recruitment + printf("tss recruitment failed for some reason, signalling ss.\n"); + } + if (!recruitTss && tssState->ssRecruitFailed()) { + TEST(true); // SS with pair TSS recruitment failed for some reason + // if ss didn't already send its pair id to tss, it was unsuccessful in recruitment + printf("ss recruitment failed for some reason, signalling tss.\n"); + } + self->recruitingStream.set(self->recruitingStream.get() - 1); self->restartRecruiting.trigger(); return Void(); } -// Recruit a worker as a storage server ACTOR Future storageRecruiter(DDTeamCollection* self, Reference> db, const DDEnabledState* ddEnabledState) { @@ -4536,13 +4830,24 @@ ACTOR Future storageRecruiter(DDTeamCollection* self, state RecruitStorageRequest lastRequest; state bool hasHealthyTeam; state std::map numSSPerAddr; + + // tss-specific recruitment state + state uint32_t tssToRecruit = self->configuration.desiredTSSCount - db->get().client.tssMapping.size(); + state Reference tssState = makeReference(); + + printf("DD setting tssToRecruit=%d (%d - %d)\n", + tssToRecruit, + self->configuration.desiredTSSCount, + db->get().client.tssMapping.size()); + TraceEvent(SevDebug, "TSS_RecruitUpdated", self->distributorId).detail("Count", tssToRecruit); + loop { try { numSSPerAddr.clear(); hasHealthyTeam = (self->healthyTeamCount != 0); RecruitStorageRequest rsr; std::set exclusions; - for (auto s = self->server_info.begin(); s != self->server_info.end(); ++s) { + for (auto s = self->server_and_tss_info.begin(); s != self->server_and_tss_info.end(); ++s) { auto serverStatus = self->server_status.get(s->second->lastKnownInterface.id()); if 
(serverStatus.excludeOnRecruit()) { TraceEvent(SevDebug, "DDRecruitExcl1") @@ -4574,7 +4879,7 @@ ACTOR Future storageRecruiter(DDTeamCollection* self, exclusions.insert(addr); } - rsr.criticalRecruitment = self->healthyTeamCount == 0; + rsr.criticalRecruitment = !hasHealthyTeam; for (auto it : exclusions) { rsr.excludeAddresses.push_back(it); } @@ -4611,10 +4916,96 @@ ACTOR Future storageRecruiter(DDTeamCollection* self, .detail("Addr", candidateSSAddr.toString()) .detail("NumExistingSS", numExistingSS); } - self->addActor.send(initializeStorage(self, candidateWorker, ddEnabledState)); + + if (hasHealthyTeam && !tssState->active && tssToRecruit > 0) { + TraceEvent("TSS_Recruit", self->distributorId) + .detail("Stage", "HoldTSS") + .detail("Addr", candidateSSAddr.toString()) + .detail("Locality", candidateWorker.worker.locality.toString()); + + TEST(true); // Starting TSS recruitment + printf("starting recruitment of tss\n"); + self->isTssRecruiting = true; + tssState = makeReference(candidateWorker.worker.locality.dcId()); + + self->addActor.send(initializeStorage(self, candidateWorker, ddEnabledState, true, tssState)); + } else { + if (tssState->active && candidateWorker.worker.locality.dcId() == tssState->dcId) { + TEST(true); // TSS recruits pair in same dc + self->isTssRecruiting = false; + TraceEvent("TSS_Recruit", self->distributorId) + .detail("Stage", "PairSS") + .detail("Addr", candidateSSAddr.toString()) + .detail("Locality", candidateWorker.worker.locality.toString()); + printf("starting recruitment of ss with eventual tss pair in dc \'%s\'\n", + tssState->dcId.present() ? 
tssState->dcId.get().toString().c_str() : ""); + self->addActor.send( + initializeStorage(self, candidateWorker, ddEnabledState, false, tssState)); + // successfully started recruitment of pair, reset tss recruitment state + tssState = makeReference(); + tssToRecruit--; + if (tssToRecruit > 0) { + printf("%d tss pairs left to recruit\n", tssToRecruit); + } + } else { + if (tssState->active) { + TEST(true); // TSS recruitment skipped potential pair because it's in a different dc + printf("Recruiting normal ss (no tss) b/c new ss is in different dc \'%s\' than tss " + "\'%s\'\n", + candidateWorker.worker.locality.dcId().present() + ? candidateWorker.worker.locality.dcId().get().toString().c_str() + : "", + tssState->dcId.present() ? tssState->dcId.get().toString().c_str() : ""); + } else { + printf("recruiting normal ss (no tss)\n"); + } + self->addActor.send(initializeStorage( + self, candidateWorker, ddEnabledState, false, makeReference())); + } + } } - when(wait(db->onChange())) { // SOMEDAY: only if clusterInterface changes? + when(wait(db->onChange())) { // SOMEDAY: only if clusterInterface or tss changes? fCandidateWorker = Future(); + // TODO REMOVE print + int newTssToRecruit = self->configuration.desiredTSSCount - db->get().client.tssMapping.size(); + if (newTssToRecruit != tssToRecruit) { + TraceEvent("TSS_RecruitUpdated", self->distributorId).detail("Count", newTssToRecruit); + tssToRecruit = newTssToRecruit; + } + + // TODO HANDLE HERE if count is more than desired tss? 
+ + printf("DD updated tssToRecruit=%d (%d - %d)\n", + tssToRecruit, + self->configuration.desiredTSSCount, + db->get().client.tssMapping.size()); + + if (self->isTssRecruiting && (tssToRecruit == 0 || self->zeroHealthyTeams->get())) { + TEST(tssToRecruit == 0); // tss recruitment cancelled due to too many TSS + TEST(self->zeroHealthyTeams->get()); // tss recruitment cancelled due zero healthy teams + TraceEvent(SevWarn, "TSS_RecruitCancelled", self->distributorId) + .detail("Reason", tssToRecruit == 0 ? "ConfigChange" : "ZeroHealthyTeams"); + printf("Cancelling tss recruitment! tssToRecruit: %d, zeroHealthyTeams: %s\n", + tssToRecruit, + self->zeroHealthyTeams->get() ? "T" : "F"); + tssState->cancel(); + tssState = makeReference(); + self->isTssRecruiting = false; + } + } + when(wait(self->zeroHealthyTeams->onChange())) { + // TODO refactor? + if (self->isTssRecruiting && self->zeroHealthyTeams->get()) { + TEST(self->zeroHealthyTeams->get()); // tss recruitment cancelled due zero healthy teams 2 + TraceEvent(SevWarn, "TSS_RecruitCancelled", self->distributorId) + .detail("Reason", "ZeroHealthyTeams"); + printf("Cancelling tss recruitment!! tssToRecruit: %d, zeroHealthyTeams: %s\n", + tssToRecruit, + self->zeroHealthyTeams->get() ? 
"T" : "F"); + tssState->cancel(); + tssState = makeReference(); + self->isTssRecruiting = false; + } } when(wait(self->restartRecruiting.onTrigger())) {} } @@ -4760,6 +5151,13 @@ ACTOR Future dataDistributionTeamCollection(Reference te self->restartRecruiting.trigger(); } + when(UID removedTSS = waitNext(self->removedTSS.getFuture())) { + TEST(true); // TSS removed from database + self->removeTSS(removedTSS); + serverRemoved.send(Void()); + + self->restartRecruiting.trigger(); + } when(wait(self->zeroHealthyTeams->onChange())) { if (self->zeroHealthyTeams->get()) { self->restartRecruiting.trigger(); @@ -5254,6 +5652,8 @@ ACTOR Future dataDistribution(Reference self, wait(waitForAll(actors)); return Void(); } catch (Error& e) { + // TODO REMOVE + printf("DD got error! %d\n", e.code()); trackerCancelled = true; state Error err = e; TraceEvent("DataDistributorDestroyTeamCollections").error(e); @@ -5265,7 +5665,8 @@ ACTOR Future dataDistribution(Reference self, if (removeFailedServer.getFuture().isReady() && !removeFailedServer.getFuture().isError()) { TraceEvent("RemoveFailedServer", removeFailedServer.getFuture().get()).error(err); wait(removeKeysFromFailedServer(cx, removeFailedServer.getFuture().get(), lock, ddEnabledState)); - wait(removeStorageServer(cx, removeFailedServer.getFuture().get(), lock, ddEnabledState)); + Optional tssPairID; + wait(removeStorageServer(cx, removeFailedServer.getFuture().get(), tssPairID, lock, ddEnabledState)); } else { if (err.code() != error_code_movekeys_conflict) { throw err; @@ -5921,3 +6322,5 @@ TEST_CASE("/DataDistribution/AddTeamsBestOf/NotEnoughServers") { return Void(); } + +// TODO add unit test for TSS recruitment? 
diff --git a/fdbserver/DataDistributionTracker.actor.cpp b/fdbserver/DataDistributionTracker.actor.cpp index 94f38622f0..51501c9b62 100644 --- a/fdbserver/DataDistributionTracker.actor.cpp +++ b/fdbserver/DataDistributionTracker.actor.cpp @@ -497,14 +497,22 @@ ACTOR Future shardSplitter(DataDistributionTracker* self, .detail("MaxBytes", shardBounds.max.bytes) .detail("MetricsBytes", metrics.bytes) .detail("Bandwidth", - bandwidthStatus == BandwidthStatusHigh ? "High" - : bandwidthStatus == BandwidthStatusNormal ? "Normal" - : "Low") + bandwidthStatus == BandwidthStatusHigh + ? "High" + : bandwidthStatus == BandwidthStatusNormal ? "Normal" : "Low") .detail("BytesPerKSec", metrics.bytesPerKSecond) .detail("NumShards", numShards); } if (numShards > 1) { + // TODO REMOVE + printf("Splitting [%s - %s) into %d shards:\n", + splitKeys[0].toString().c_str(), + splitKeys[numShards].toString().c_str(), + numShards); + for (int i = 0; i < numShards; i++) { + printf(" [%s - %s)\n", splitKeys[i].toString().c_str(), splitKeys[i + 1].toString().c_str()); + } int skipRange = deterministicRandom()->randomInt(0, numShards); // The queue can't deal with RelocateShard requests which split an existing shard into three pieces, so // we have to send the unskipped ranges in this order (nibbling in from the edges of the old range) diff --git a/fdbserver/Knobs.cpp b/fdbserver/Knobs.cpp index fc1234d243..8e507f1727 100644 --- a/fdbserver/Knobs.cpp +++ b/fdbserver/Knobs.cpp @@ -217,6 +217,7 @@ void ServerKnobs::initialize(bool randomize, ClientKnobs* clientKnobs, bool isSi init( SERVER_LIST_DELAY, 1.0 ); init( RECRUITMENT_IDLE_DELAY, 1.0 ); init( STORAGE_RECRUITMENT_DELAY, 10.0 ); + init( TSS_RECRUITMENT_TIMEOUT, 3*STORAGE_RECRUITMENT_DELAY ); if (randomize && BUGGIFY ) TSS_RECRUITMENT_TIMEOUT = 1.0; //Super low timeout should cause tss recruitments to fail init( DATA_DISTRIBUTION_LOGGING_INTERVAL, 5.0 ); init( DD_ENABLED_CHECK_DELAY, 1.0 ); init( DD_STALL_CHECK_DELAY, 0.4 ); //Must be larger 
than 2*MAX_BUGGIFIED_DELAY diff --git a/fdbserver/Knobs.h b/fdbserver/Knobs.h index be2caba6a1..9a4cc4a047 100644 --- a/fdbserver/Knobs.h +++ b/fdbserver/Knobs.h @@ -167,6 +167,7 @@ public: double SERVER_LIST_DELAY; double RECRUITMENT_IDLE_DELAY; double STORAGE_RECRUITMENT_DELAY; + double TSS_RECRUITMENT_TIMEOUT; double DATA_DISTRIBUTION_LOGGING_INTERVAL; double DD_ENABLED_CHECK_DELAY; double DD_STALL_CHECK_DELAY; diff --git a/fdbserver/MoveKeys.actor.cpp b/fdbserver/MoveKeys.actor.cpp index 1f2e3a9780..927a7af00b 100644 --- a/fdbserver/MoveKeys.actor.cpp +++ b/fdbserver/MoveKeys.actor.cpp @@ -20,6 +20,8 @@ #include "flow/Util.h" #include "fdbrpc/FailureMonitor.h" +#include "fdbclient/DatabaseContext.h" // for tss mapping +#include "fdbclient/KeyBackedTypes.h" #include "fdbclient/SystemData.h" #include "fdbserver/MoveKeys.actor.h" #include "fdbserver/Knobs.h" @@ -99,6 +101,7 @@ ACTOR static Future checkMoveKeysLock(Transaction* tr, bool isWrite = true) { if (!ddEnabledState->isDDEnabled()) { TraceEvent(SevDebug, "DDDisabledByInMemoryCheck"); + printf("MK: DD disabled\n"); throw movekeys_conflict(); } Optional readVal = wait(tr->get(moveKeysLockOwnerKey)); @@ -110,6 +113,7 @@ ACTOR static Future checkMoveKeysLock(Transaction* tr, UID lastWrite = readVal.present() ? 
BinaryReader::fromStringRef(readVal.get(), Unversioned()) : UID(); if (lastWrite != lock.prevWrite) { TEST(true); // checkMoveKeysLock: Conflict with previous owner + printf("MK: conflict with previous owner\n"); throw movekeys_conflict(); } @@ -143,6 +147,7 @@ ACTOR static Future checkMoveKeysLock(Transaction* tr, return Void(); } else { TEST(true); // checkMoveKeysLock: Conflict with new owner + printf("MK: conflict %s with new owner %s\n", currentOwner.toString().c_str(), lock.myOwner.toString().c_str()); throw movekeys_conflict(); } } @@ -158,7 +163,7 @@ ACTOR Future> checkReadWrite(Future> f return Optional(uid); } -Future removeOldDestinations(Transaction* tr, +Future removeOldDestinations(Reference tr, UID oldDest, VectorRef shards, KeyRangeRef currentKeys) { @@ -235,7 +240,7 @@ ACTOR Future> addReadWriteDestinations(KeyRangeRef shard, } ACTOR Future>> additionalSources(RangeResult shards, - Transaction* tr, + Reference tr, int desiredHealthy, int maxServers) { state RangeResult UIDtoTagMap = wait(tr->getRange(serverTagKeys, CLIENT_KNOBS->TOO_MANY)); @@ -325,6 +330,12 @@ ACTOR static Future startMoveKeys(Database occ, state Future warningLogger = logWarningAfter("StartMoveKeysTooLong", 600, servers); // state TraceInterval waitInterval(""); + // TODO REMOVE + printf("starting move keys for [%s, %s): to %s\n", + keys.begin.toString().c_str(), + keys.end.toString().c_str(), + servers[0].toString().c_str()); + wait(startMoveKeysLock->take(TaskPriority::DataDistributionLaunch)); state FlowLock::Releaser releaser(*startMoveKeysLock); @@ -343,7 +354,8 @@ ACTOR static Future startMoveKeys(Database occ, TEST(begin > keys.begin); // Multi-transactional startMoveKeys batches++; - state Transaction tr(occ); + // RYW to optimize re-reading the same key ranges + state Reference tr = makeReference(occ); state int retries = 0; loop { @@ -356,15 +368,16 @@ ACTOR static Future startMoveKeys(Database occ, // Keep track of shards for all src servers so that we can preserve 
their values in serverKeys state Map> shardMap; - tr.info.taskID = TaskPriority::MoveKeys; - tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); + tr->getTransaction().info.taskID = TaskPriority::MoveKeys; + tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - wait(checkMoveKeysLock(&tr, lock, ddEnabledState)); + wait(checkMoveKeysLock(&(tr->getTransaction()), lock, ddEnabledState)); vector>> serverListEntries; serverListEntries.reserve(servers.size()); for (int s = 0; s < servers.size(); s++) - serverListEntries.push_back(tr.get(serverListKeyFor(servers[s]))); + serverListEntries.push_back(tr->get(serverListKeyFor(servers[s]))); state vector> serverListValues = wait(getAll(serverListEntries)); for (int s = 0; s < serverListValues.size(); s++) { @@ -380,11 +393,12 @@ ACTOR static Future startMoveKeys(Database occ, // Get all existing shards overlapping keys (exclude any that have been processed in a previous // iteration of the outer loop) state KeyRange currentKeys = KeyRangeRef(begin, keys.end); - state RangeResult old = wait(krmGetRanges(&tr, - keyServersPrefix, - currentKeys, - SERVER_KNOBS->MOVE_KEYS_KRM_LIMIT, - SERVER_KNOBS->MOVE_KEYS_KRM_LIMIT_BYTES)); + + state RangeResult old = wait(krmGetRanges(tr, + keyServersPrefix, + currentKeys, + SERVER_KNOBS->MOVE_KEYS_KRM_LIMIT, + SERVER_KNOBS->MOVE_KEYS_KRM_LIMIT_BYTES)); // Determine the last processed key (which will be the beginning for the next iteration) state Key endKey = old.end()[-1].key; @@ -399,10 +413,10 @@ ACTOR static Future startMoveKeys(Database occ, // printf("'%s': '%s'\n", old[i].key.toString().c_str(), old[i].value.toString().c_str()); // Check that enough servers for each shard are in the correct state - state RangeResult UIDtoTagMap = wait(tr.getRange(serverTagKeys, CLIENT_KNOBS->TOO_MANY)); + state RangeResult UIDtoTagMap = wait(tr->getRange(serverTagKeys, CLIENT_KNOBS->TOO_MANY)); ASSERT(!UIDtoTagMap.more 
&& UIDtoTagMap.size() < CLIENT_KNOBS->TOO_MANY); vector> addAsSource = wait(additionalSources( - old, &tr, servers.size(), SERVER_KNOBS->MAX_ADDED_SOURCES_MULTIPLIER * servers.size())); + old, tr, servers.size(), SERVER_KNOBS->MAX_ADDED_SOURCES_MULTIPLIER * servers.size())); // For each intersecting range, update keyServers[range] dest to be servers and clear existing dest // servers from serverKeys @@ -417,7 +431,7 @@ ACTOR static Future startMoveKeys(Database occ, // .detail("KeyEnd", rangeIntersectKeys.end.toString()) // .detail("OldSrc", describe(src)) // .detail("OldDest", describe(dest)) - // .detail("ReadVersion", tr.getReadVersion().get()); + // .detail("ReadVersion", tr->getReadVersion().get()); for (auto& uid : addAsSource[i]) { src.push_back(uid); @@ -425,7 +439,7 @@ ACTOR static Future startMoveKeys(Database occ, uniquify(src); // Update dest servers for this range to be equal to servers - krmSetPreviouslyEmptyRange(&tr, + krmSetPreviouslyEmptyRange(&(tr->getTransaction()), keyServersPrefix, rangeIntersectKeys, keyServersValue(UIDtoTagMap, src, servers), @@ -455,7 +469,7 @@ ACTOR static Future startMoveKeys(Database occ, vector> actors; for (oldDest = oldDests.begin(); oldDest != oldDests.end(); ++oldDest) if (std::find(servers.begin(), servers.end(), *oldDest) == servers.end()) - actors.push_back(removeOldDestinations(&tr, *oldDest, shardMap[*oldDest], currentKeys)); + actors.push_back(removeOldDestinations(tr, *oldDest, shardMap[*oldDest], currentKeys)); // Update serverKeys to include keys (or the currently processed subset of keys) for each SS in // servers @@ -464,12 +478,12 @@ ACTOR static Future startMoveKeys(Database occ, // to have the same shard boundaries If that invariant was important, we would have to move this // inside the loop above and also set it for the src servers actors.push_back(krmSetRangeCoalescing( - &tr, serverKeysPrefixFor(servers[i]), currentKeys, allKeys, serverKeysTrue)); + tr, serverKeysPrefixFor(servers[i]), currentKeys, 
allKeys, serverKeysTrue)); } wait(waitForAll(actors)); - wait(tr.commit()); + wait(tr->commit()); /*TraceEvent("StartMoveKeysCommitDone", relocationIntervalId) .detail("CommitVersion", tr.getCommittedVersion()) @@ -481,7 +495,7 @@ ACTOR static Future startMoveKeys(Database occ, state Error err = e; if (err.code() == error_code_move_to_removed_server) throw; - wait(tr.onError(e)); + wait(tr->onError(e)); if (retries % 10 == 0) { TraceEvent( @@ -500,7 +514,7 @@ ACTOR static Future startMoveKeys(Database occ, } // printf("Committed moving '%s'-'%s' (version %lld)\n", keys.begin.toString().c_str(), - // keys.end.toString().c_str(), tr.getCommittedVersion()); + // keys.end.toString().c_str(), tr->getCommittedVersion()); TraceEvent(SevDebug, interval.end(), relocationIntervalId) .detail("Batches", batches) .detail("Shards", shards) @@ -517,15 +531,37 @@ ACTOR Future waitForShardReady(StorageServerInterface server, KeyRange keys, Version minVersion, GetShardStateRequest::waitMode mode) { + // TODO REMOVE + printf("waiting for shard [%s, %s) in state %d from %sss %s @ %lld\n", + keys.begin.toString().c_str(), + keys.end.toString().c_str(), + mode, + server.isTss ? "t" : "", + server.id().toString().c_str(), + minVersion); loop { try { GetShardStateReply rep = wait(server.getShardState.getReply(GetShardStateRequest(keys, mode), TaskPriority::MoveKeys)); if (rep.first >= minVersion) { + // TODO REMOVE + printf("shard [%s, %s) is in state %d from %sss %s @ %lld >= %lld\n", + keys.begin.toString().c_str(), + keys.end.toString().c_str(), + mode, + server.isTss ? "t" : "", + server.id().toString().c_str(), + rep.first, + minVersion); return Void(); } wait(delayJittered(SERVER_KNOBS->SHARD_READY_DELAY, TaskPriority::MoveKeys)); } catch (Error& e) { + printf("Waiting for shard from %sss %s getValue=%s got error! %d\n", + server.isTss ? 
"t" : "", + server.id().toString().c_str(), + server.getValue.getEndpoint().token.toString().c_str(), + e.code()); if (e.code() != error_code_timed_out) { if (e.code() != error_code_broken_promise) throw e; @@ -536,6 +572,8 @@ ACTOR Future waitForShardReady(StorageServerInterface server, } } +// best effort to also wait for TSS on data move + ACTOR Future checkFetchingState(Database cx, vector dest, KeyRange keys, @@ -557,6 +595,8 @@ ACTOR Future checkFetchingState(Database cx, serverListEntries.push_back(tr.get(serverListKeyFor(dest[s]))); state vector> serverListValues = wait(getAll(serverListEntries)); vector> requests; + state vector> tssRequests; + ClientDBInfo clientInfo = cx->clientInfo->get(); for (int s = 0; s < serverListValues.size(); s++) { if (!serverListValues[s].present()) { // FIXME: Is this the right behavior? dataMovementComplete will never be sent! @@ -567,10 +607,25 @@ ACTOR Future checkFetchingState(Database cx, ASSERT(si.id() == dest[s]); requests.push_back( waitForShardReady(si, keys, tr.getReadVersion().get(), GetShardStateRequest::FETCHING)); + + Optional tssPair = clientInfo.getTssPair(si.id()); + if (tssPair.present()) { + tssRequests.push_back(waitForShardReady( + tssPair.get(), keys, tr.getReadVersion().get(), GetShardStateRequest::FETCHING)); + } } wait(timeoutError(waitForAll(requests), SERVER_KNOBS->SERVER_READY_QUORUM_TIMEOUT, TaskPriority::MoveKeys)); + // If normal servers return normally, give TSS data movement a bit of a chance, but don't block on it, and + // ignore errors in tss requests + if (tssRequests.size()) { + wait(timeout(waitForAllReady(tssRequests), + SERVER_KNOBS->SERVER_READY_QUORUM_TIMEOUT / 2, + Void(), + TaskPriority::MoveKeys)); + } + dataMovementComplete.send(Void()); return Void(); } catch (Error& e) { @@ -601,9 +656,18 @@ ACTOR static Future finishMoveKeys(Database occ, state Key endKey; state int retries = 0; state FlowLock::Releaser releaser; + state int waitForTSSCounter = + 2; // try waiting for tss for 
a 2 loops, give up if they're stuck to not affect the rest of the cluster + + // for killing tss if any get stuck during movekeys + state KeyBackedMap tssMapDB = KeyBackedMap(tssMappingKeys.begin); + state std::vector tssToKill; + state std::set tssToIgnore; ASSERT(!destinationTeam.empty()); + printf("finishing move keys for [%s, %s)\n", keys.begin.toString().c_str(), keys.end.toString().c_str()); + try { TraceEvent(SevDebug, interval.begin(), relocationIntervalId) .detail("KeyBegin", keys.begin) @@ -616,9 +680,53 @@ ACTOR static Future finishMoveKeys(Database occ, state Transaction tr(occ); - // printf("finishMoveKeys( '%s'-'%s' )\n", keys.begin.toString().c_str(), keys.end.toString().c_str()); + // TODO re-comment and change back + printf("finishMoveKeys( '%s'-'%s' )\n", begin.toString().c_str(), keys.end.toString().c_str()); loop { try { + if (tssToKill.size()) { + // TODO could move this to helper method? + // TODO add trace event + TEST(true); // killing TSS because they were unavailable for movekeys + printf("KILLING %d TSS BECAUSE THEY TIMED OUT IN MOVEKEYS\n", tssToKill.size()); + + // kill tss BEFORE committing main txn so that client requests don't make it to the tss when it + // has a different shard set than its pair use a different RYW transaction since i'm too lazy + // (and don't want to add bugs) by changing whole method to RYW. also using a different + // transaction makes it commit earlier which we may need to guarantee causality of tss getting + // removed before client sends a request to this key range on the new ss + state Reference tssTr = + makeReference(occ); + loop { + try { + tssTr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); + tssTr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + for (auto& tss : tssToKill) { + // DO NOT remove server list key - that'll break a bunch of stuff. 
DD will + // eventually call removeStorageServer tssTr->clear(serverListKeyFor(tss.id())); + tssTr->clear(serverTagKeyFor(tss.id())); + // tssTr->clear(serverTagHistoryRangeFor(tss.id())); + tssMapDB.erase(tssTr, tss.tssPairID); + } + tssTr->set(tssMappingChangeKey, deterministicRandom()->randomUniqueID().toString()); + wait(tssTr->commit()); + + for (auto& tss : tssToKill) { + // TODO ADD trace event (sev30?) + printf("Successfully removed TSS %s in finishMoveKeys\n", + tss.id().toString().c_str()); + tssToIgnore.insert(tss.id()); + } + tssToKill.clear(); + + break; + } catch (Error& e) { + printf("MoveKeys TSS Removal Transaction got error %d\n", e.code()); + wait(tssTr->onError(e)); + } + } + } + tr.info.taskID = TaskPriority::MoveKeys; tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); @@ -763,6 +871,8 @@ ACTOR static Future finishMoveKeys(Database occ, // between // now and when this transaction commits. state vector> serverReady; // only for count below + state vector> tssReady; // for waiting in parallel with tss + state vector tssReadyInterfs; state vector newDestinations; std::set completeSrcSet(completeSrc.begin(), completeSrc.end()); for (auto& it : dest) { @@ -789,22 +899,104 @@ ACTOR static Future finishMoveKeys(Database occ, storageServerInterfaces.push_back(si); } + // update client info in case tss mapping changed or server got updated + + // Use most up to date version of tss mapping + ClientDBInfo clientInfo = occ->clientInfo->get(); + // Wait for new destination servers to fetch the keys + serverReady.reserve(storageServerInterfaces.size()); - for (int s = 0; s < storageServerInterfaces.size(); s++) + tssReady.reserve(storageServerInterfaces.size()); + tssReadyInterfs.reserve(storageServerInterfaces.size()); + for (int s = 0; s < storageServerInterfaces.size(); s++) { serverReady.push_back(waitForShardReady(storageServerInterfaces[s], keys, tr.getReadVersion().get(), GetShardStateRequest::READABLE)); - 
wait(timeout(waitForAll(serverReady), + + Optional tssPair = + clientInfo.getTssPair(storageServerInterfaces[s].id()); + + if (tssPair.present() && waitForTSSCounter > 0 && !tssToIgnore.count(tssPair.get().id())) { + tssReadyInterfs.push_back(tssPair.get()); + tssReady.push_back(waitForShardReady( + tssPair.get(), keys, tr.getReadVersion().get(), GetShardStateRequest::READABLE)); + } + } + + // Wait for all storage server moves, and explicitly swallow errors for tss ones with + // waitForAllReady If this takes too long the transaction will time out and retry, which is ok + wait(timeout(waitForAll(serverReady) && waitForAllReady(tssReady), SERVER_KNOBS->SERVER_READY_QUORUM_TIMEOUT, Void(), TaskPriority::MoveKeys)); + + // Check to see if we're waiting only on tss. If so, decrement the waiting counter. + // If the waiting counter is zero, kill the slow/non-responsive tss processes before finalizing the + // data move. + if (tssReady.size()) { + bool allSSDone = true; + for (auto& f : serverReady) { + allSSDone &= f.isReady() && !f.isError(); + if (!allSSDone) { + break; + } + } + + if (allSSDone) { + bool anyTssNotDone = false; + + for (auto& f : tssReady) { + if (!f.isReady() || f.isError()) { + anyTssNotDone = true; + printf("MK: [%s - %s) waiting on tss!\n", + begin.toString().c_str(), + keys.end.toString().c_str()); + waitForTSSCounter--; + break; + } + } + + if (anyTssNotDone && waitForTSSCounter == 0) { + for (int i = 0; i < tssReady.size(); i++) { + if (!tssReady[i].isReady() || tssReady[i].isError()) { + // TODO trace event!! 
+ printf("TSS NOT DONE %s with move keys, killing!!\n", + tssReadyInterfs[i].id().toString().c_str()); + tssToKill.push_back(tssReadyInterfs[i]); + } + } + // repeat loop and go back to start to kill tss' before continuing on + continue; + } + } + } + int count = dest.size() - newDestinations.size(); for (int s = 0; s < serverReady.size(); s++) count += serverReady[s].isReady() && !serverReady[s].isError(); - // printf(" fMK: moved data to %d/%d servers\n", count, serverReady.size()); + int tssCount = 0; + for (int s = 0; s < tssReady.size(); s++) + tssCount += tssReady[s].isReady() && !tssReady[s].isError(); + + // TODO re-comment + if (tssReady.size()) { + printf(" fMK: [%s - %s) moved data to %d/%d servers and %d/%d tss\n", + begin.toString().c_str(), + keys.end.toString().c_str(), + count, + serverReady.size(), + tssCount, + tssReady.size()); + } else { + printf(" fMK: [%s - %s) moved data to %d/%d servers\n", + begin.toString().c_str(), + keys.end.toString().c_str(), + count, + serverReady.size()); + } TraceEvent(SevDebug, waitInterval.end(), relocationIntervalId).detail("ReadyServers", count); if (count == dest.size()) { @@ -834,6 +1026,7 @@ ACTOR static Future finishMoveKeys(Database occ, } tr.reset(); } catch (Error& error) { + printf(" fMK: error %d\n", error.code()); if (error.code() == error_code_actor_cancelled) throw; state Error err = error; @@ -862,43 +1055,50 @@ ACTOR static Future finishMoveKeys(Database occ, } ACTOR Future> addStorageServer(Database cx, StorageServerInterface server) { - state Transaction tr(cx); + state Reference tr = makeReference(cx); + state KeyBackedMap tssMapDB = KeyBackedMap(tssMappingKeys.begin); state int maxSkipTags = 1; + + printf("%sSS %s adding itself\n", server.isTss ? 
"T" : "", server.id().toString().c_str()); loop { try { - state Future fTagLocalities = tr.getRange(tagLocalityListKeys, CLIENT_KNOBS->TOO_MANY); - state Future> fv = tr.get(serverListKeyFor(server.id())); + // TODO should also set priority system immediate? also why is this needed? + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - state Future> fExclProc = tr.get( + // TODO don't fetch tag localities, all tags, and history tags if tss. Just fetch pair's tag + state Future fTagLocalities = tr->getRange(tagLocalityListKeys, CLIENT_KNOBS->TOO_MANY); + state Future> fv = tr->get(serverListKeyFor(server.id())); + + state Future> fExclProc = tr->get( StringRef(encodeExcludedServersKey(AddressExclusion(server.address().ip, server.address().port)))); state Future> fExclIP = - tr.get(StringRef(encodeExcludedServersKey(AddressExclusion(server.address().ip)))); - state Future> fFailProc = - tr.get(StringRef(encodeFailedServersKey(AddressExclusion(server.address().ip, server.address().port)))); + tr->get(StringRef(encodeExcludedServersKey(AddressExclusion(server.address().ip)))); + state Future> fFailProc = tr->get( + StringRef(encodeFailedServersKey(AddressExclusion(server.address().ip, server.address().port)))); state Future> fFailIP = - tr.get(StringRef(encodeFailedServersKey(AddressExclusion(server.address().ip)))); + tr->get(StringRef(encodeFailedServersKey(AddressExclusion(server.address().ip)))); state Future> fExclProc2 = server.secondaryAddress().present() - ? tr.get(StringRef(encodeExcludedServersKey( + ? tr->get(StringRef(encodeExcludedServersKey( AddressExclusion(server.secondaryAddress().get().ip, server.secondaryAddress().get().port)))) : Future>(Optional()); state Future> fExclIP2 = server.secondaryAddress().present() - ? tr.get(StringRef(encodeExcludedServersKey(AddressExclusion(server.secondaryAddress().get().ip)))) + ? 
tr->get(StringRef(encodeExcludedServersKey(AddressExclusion(server.secondaryAddress().get().ip)))) : Future>(Optional()); state Future> fFailProc2 = server.secondaryAddress().present() - ? tr.get(StringRef(encodeFailedServersKey( + ? tr->get(StringRef(encodeFailedServersKey( AddressExclusion(server.secondaryAddress().get().ip, server.secondaryAddress().get().port)))) : Future>(Optional()); state Future> fFailIP2 = server.secondaryAddress().present() - ? tr.get(StringRef(encodeFailedServersKey(AddressExclusion(server.secondaryAddress().get().ip)))) + ? tr->get(StringRef(encodeFailedServersKey(AddressExclusion(server.secondaryAddress().get().ip)))) : Future>(Optional()); - state Future fTags = tr.getRange(serverTagKeys, CLIENT_KNOBS->TOO_MANY, true); - state Future fHistoryTags = tr.getRange(serverTagHistoryKeys, CLIENT_KNOBS->TOO_MANY, true); + state Future fTags = tr->getRange(serverTagKeys, CLIENT_KNOBS->TOO_MANY, true); + state Future fHistoryTags = tr->getRange(serverTagHistoryKeys, CLIENT_KNOBS->TOO_MANY, true); wait(success(fTagLocalities) && success(fv) && success(fTags) && success(fHistoryTags) && success(fExclProc) && success(fExclIP) && success(fFailProc) && success(fFailIP) && @@ -908,70 +1108,109 @@ ACTOR Future> addStorageServer(Database cx, StorageServe if (fExclProc.get().present() || fExclIP.get().present() || fFailProc.get().present() || fFailIP.get().present() || fExclProc2.get().present() || fExclIP2.get().present() || fFailProc2.get().present() || fFailIP2.get().present()) { + printf("%sSS %s failing to recruit because of exclusion\n", + server.isTss ? 
"T" : "", + server.id().toString().c_str()); throw recruitment_failed(); } if (fTagLocalities.get().more || fTags.get().more || fHistoryTags.get().more) ASSERT(false); - int8_t maxTagLocality = 0; - state int8_t locality = -1; - for (auto& kv : fTagLocalities.get()) { - int8_t loc = decodeTagLocalityListValue(kv.value); - if (decodeTagLocalityListKey(kv.key) == server.locality.dcId()) { - locality = loc; - break; - } - maxTagLocality = std::max(maxTagLocality, loc); - } - - if (locality == -1) { - locality = maxTagLocality + 1; - if (locality < 0) - throw recruitment_failed(); - tr.set(tagLocalityListKeyFor(server.locality.dcId()), tagLocalityListValue(locality)); - } - - int skipTags = deterministicRandom()->randomInt(0, maxSkipTags); - - state uint16_t tagId = 0; - std::vector usedTags; - for (auto& it : fTags.get()) { - Tag t = decodeServerTagValue(it.value); - if (t.locality == locality) { - usedTags.push_back(t.id); - } - } - for (auto& it : fHistoryTags.get()) { - Tag t = decodeServerTagValue(it.value); - if (t.locality == locality) { - usedTags.push_back(t.id); - } - } - std::sort(usedTags.begin(), usedTags.end()); - - int usedIdx = 0; - for (; usedTags.size() > 0 && tagId <= usedTags.end()[-1]; tagId++) { - if (tagId < usedTags[usedIdx]) { - if (skipTags == 0) + state Tag tag; + if (server.isTss) { + bool foundTag = false; + for (auto& it : fTags.get()) { + UID key = decodeServerTagKey(it.key); + if (key == server.tssPairID) { + tag = decodeServerTagValue(it.value); + foundTag = true; break; - skipTags--; - } else { - usedIdx++; + } } + if (!foundTag) { + throw recruitment_failed(); + } + // ASSERT(foundTag); // TSS's pair was removed before TSS could register. Should never happen, since the + // SS shouldn't be tracked by DD until this completes. 
+ printf("TSS %s found tag %s for pair %s\n", + server.id().toString().c_str(), + tag.toString().c_str(), + server.tssPairID.toString().c_str()); + tssMapDB.set(tr, server.tssPairID, server.id()); + tr->set(tssMappingChangeKey, deterministicRandom()->randomUniqueID().toString()); + + } else { + int8_t maxTagLocality = 0; + state int8_t locality = -1; + // TODO i think tss can ignore this part? + for (auto& kv : fTagLocalities.get()) { + int8_t loc = decodeTagLocalityListValue(kv.value); + if (decodeTagLocalityListKey(kv.key) == server.locality.dcId()) { + locality = loc; + break; + } + maxTagLocality = std::max(maxTagLocality, loc); + } + + if (locality == -1) { + locality = maxTagLocality + 1; + if (locality < 0) { + throw recruitment_failed(); + } + tr->set(tagLocalityListKeyFor(server.locality.dcId()), tagLocalityListValue(locality)); + } + + int skipTags = deterministicRandom()->randomInt(0, maxSkipTags); + + state uint16_t tagId = 0; + std::vector usedTags; + for (auto& it : fTags.get()) { + Tag t = decodeServerTagValue(it.value); + if (t.locality == locality) { + usedTags.push_back(t.id); + } + } + for (auto& it : fHistoryTags.get()) { + Tag t = decodeServerTagValue(it.value); + if (t.locality == locality) { + usedTags.push_back(t.id); + } + } + std::sort(usedTags.begin(), usedTags.end()); + + int usedIdx = 0; + for (; usedTags.size() > 0 && tagId <= usedTags.end()[-1]; tagId++) { + if (tagId < usedTags[usedIdx]) { + if (skipTags == 0) + break; + skipTags--; + } else { + usedIdx++; + } + } + tagId += skipTags; + + tag = Tag(locality, tagId); + + tr->set(serverTagKeyFor(server.id()), serverTagValue(tag)); + KeyRange conflictRange = singleKeyRange(serverTagConflictKeyFor(tag)); + tr->addReadConflictRange(conflictRange); + tr->addWriteConflictRange(conflictRange); } - tagId += skipTags; - state Tag tag(locality, tagId); - tr.set(serverTagKeyFor(server.id()), serverTagValue(tag)); - tr.set(serverListKeyFor(server.id()), serverListValue(server)); - KeyRange 
conflictRange = singleKeyRange(serverTagConflictKeyFor(tag)); - tr.addReadConflictRange(conflictRange); - tr.addWriteConflictRange(conflictRange); - - wait(tr.commit()); - return std::make_pair(tr.getCommittedVersion(), tag); + tr->set(serverListKeyFor(server.id()), serverListValue(server)); + wait(tr->commit()); + printf("%sSS %s successfully added itself @ %lld\n", + server.isTss ? "T" : "", + server.id().toString().c_str(), + tr->getCommittedVersion()); + return std::make_pair(tr->getCommittedVersion(), tag); } catch (Error& e) { + printf("%sSS %s got error adding itself: %d!!\n", + server.isTss ? "T" : "", + server.id().toString().c_str(), + e.code()); if (e.code() == error_code_commit_unknown_result) throw recruitment_failed(); // There is a remote possibility that we successfully added ourselves and // then someone removed us, so we have to fail @@ -980,12 +1219,12 @@ ACTOR Future> addStorageServer(Database cx, StorageServe maxSkipTags = SERVER_KNOBS->MAX_SKIP_TAGS; } - wait(tr.onError(e)); + wait(tr->onError(e)); } } } // A SS can be removed only if all data (shards) on the SS have been moved away from the SS. 
-ACTOR Future canRemoveStorageServer(Transaction* tr, UID serverID) { +ACTOR Future canRemoveStorageServer(Reference tr, UID serverID) { RangeResult keys = wait(krmGetRanges(tr, serverKeysPrefixFor(serverID), allKeys, 2)); ASSERT(keys.size() >= 2); @@ -1005,34 +1244,39 @@ ACTOR Future canRemoveStorageServer(Transaction* tr, UID serverID) { ACTOR Future removeStorageServer(Database cx, UID serverID, + Optional tssPairID, MoveKeysLock lock, const DDEnabledState* ddEnabledState) { - state Transaction tr(cx); + state KeyBackedMap tssMapDB = KeyBackedMap(tssMappingKeys.begin); + state Reference tr = makeReference(cx); state bool retry = false; state int noCanRemoveCount = 0; + + printf("Removing storage server %s\n", serverID.toString().c_str()); + loop { try { - tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); - wait(checkMoveKeysLock(&tr, lock, ddEnabledState)); + tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + wait(checkMoveKeysLock(&(tr->getTransaction()), lock, ddEnabledState)); TraceEvent("RemoveStorageServerLocked") .detail("ServerID", serverID) - .detail("Version", tr.getReadVersion().get()); + .detail("Version", tr->getReadVersion().get()); - state bool canRemove = wait(canRemoveStorageServer(&tr, serverID)); + state bool canRemove = wait(canRemoveStorageServer(tr, serverID)); if (!canRemove) { TEST(true); // The caller had a transaction in flight that assigned keys to the server. Wait for it to // reverse its mistake. 
TraceEvent(SevWarn, "NoCanRemove").detail("Count", noCanRemoveCount++).detail("ServerID", serverID); wait(delayJittered(SERVER_KNOBS->REMOVE_RETRY_DELAY, TaskPriority::DataDistributionLaunch)); - tr.reset(); + tr->reset(); TraceEvent("RemoveStorageServerRetrying").detail("CanRemove", canRemove); } else { - - state Future> fListKey = tr.get(serverListKeyFor(serverID)); - state Future fTags = tr.getRange(serverTagKeys, CLIENT_KNOBS->TOO_MANY); - state Future fHistoryTags = tr.getRange(serverTagHistoryKeys, CLIENT_KNOBS->TOO_MANY); - state Future fTagLocalities = tr.getRange(tagLocalityListKeys, CLIENT_KNOBS->TOO_MANY); - state Future fTLogDatacenters = tr.getRange(tLogDatacentersKeys, CLIENT_KNOBS->TOO_MANY); + state Future> fListKey = tr->get(serverListKeyFor(serverID)); + state Future fTags = tr->getRange(serverTagKeys, CLIENT_KNOBS->TOO_MANY); + state Future fHistoryTags = tr->getRange(serverTagHistoryKeys, CLIENT_KNOBS->TOO_MANY); + state Future fTagLocalities = tr->getRange(tagLocalityListKeys, CLIENT_KNOBS->TOO_MANY); + state Future fTLogDatacenters = tr->getRange(tLogDatacentersKeys, CLIENT_KNOBS->TOO_MANY); wait(success(fListKey) && success(fTags) && success(fHistoryTags) && success(fTagLocalities) && success(fTLogDatacenters)); @@ -1072,22 +1316,33 @@ ACTOR Future removeStorageServer(Database cx, if (locality >= 0 && !allLocalities.count(locality)) { for (auto& it : fTagLocalities.get()) { if (locality == decodeTagLocalityListValue(it.value)) { - tr.clear(it.key); + tr->clear(it.key); break; } } } - tr.clear(serverListKeyFor(serverID)); - tr.clear(serverTagKeyFor(serverID)); - tr.clear(serverTagHistoryRangeFor(serverID)); + tr->clear(serverListKeyFor(serverID)); + tr->clear(serverTagKeyFor(serverID)); // the tss uses this to communicate shutdown but it never has a + // server tag key set in the first place + tr->clear(serverTagHistoryRangeFor(serverID)); + + // TODO a small optimization would be to only erase and trigger tss mapping if this is a tss or an ss 
+ // with a tss pair, instead of always + if (tssPairID.present()) { + tssMapDB.erase(tr, tssPairID.get()); + } else { + tssMapDB.erase(tr, serverID); + } + tr->set(tssMappingChangeKey, deterministicRandom()->randomUniqueID().toString()); + retry = true; - wait(tr.commit()); + wait(tr->commit()); return Void(); } } catch (Error& e) { state Error err = e; - wait(tr.onError(e)); + wait(tr->onError(e)); TraceEvent("RemoveStorageServerRetrying").error(err); } } @@ -1099,6 +1354,7 @@ ACTOR Future removeKeysFromFailedServer(Database cx, MoveKeysLock lock, const DDEnabledState* ddEnabledState) { state Key begin = allKeys.begin; + printf("Removing keys from failed server %s\n", serverID.toString().c_str()); // Multi-transactional removal in case of large number of shards, concern in violating 5s transaction limit while (begin < allKeys.end) { state Transaction tr(cx); @@ -1200,6 +1456,8 @@ ACTOR Future moveKeys(Database cx, if (!dataMovementComplete.isSet()) dataMovementComplete.send(Void()); + printf("move keys done for [%s, %s)\n", keys.begin.toString().c_str(), keys.end.toString().c_str()); + return Void(); } diff --git a/fdbserver/MoveKeys.actor.h b/fdbserver/MoveKeys.actor.h index e8ae691878..c8092bbcdd 100644 --- a/fdbserver/MoveKeys.actor.h +++ b/fdbserver/MoveKeys.actor.h @@ -89,13 +89,14 @@ ACTOR Future> addStorageServer(Database cx, StorageServe ACTOR Future removeStorageServer(Database cx, UID serverID, + Optional tssPairID, // if serverID is a tss, set to its ss pair id MoveKeysLock lock, const DDEnabledState* ddEnabledState); // Removes the given storage server permanently from the database. It must already // have no shards assigned to it. The storage server MUST NOT be added again after this // (though a new storage server with a new unique ID may be recruited from the same fdbserver). 
-ACTOR Future canRemoveStorageServer(Transaction* tr, UID serverID); +ACTOR Future canRemoveStorageServer(Reference tr, UID serverID); // Returns true if the given storage server has no keys assigned to it and may be safely removed // Obviously that could change later! ACTOR Future removeKeysFromFailedServer(Database cx, diff --git a/fdbserver/MutationTracking.cpp b/fdbserver/MutationTracking.cpp index 16a17a0f10..b0e7215fb8 100644 --- a/fdbserver/MutationTracking.cpp +++ b/fdbserver/MutationTracking.cpp @@ -30,6 +30,9 @@ // Track up to 2 keys in simulation via enabling MUTATION_TRACKING_ENABLED and setting the keys here. StringRef debugKey = LiteralStringRef(""); StringRef debugKey2 = LiteralStringRef("\xff\xff\xff\xff"); +// StringRef debugKey = LiteralStringRef("\x00\x00\x02\xff\x00\x00\x04\xc1\x00\x00\x00\x01\x00\x00\x00\x02"); // missing +// from ss StringRef debugKey2 = LiteralStringRef("\x00\x00\x02\xff\x00\x00\x01\x89\x00\x00\x00\x04\x00\x00\x00\x02"); +// // missing from tss TraceEvent debugMutationEnabled(const char* context, Version version, MutationRef const& mutation) { if ((mutation.type == mutation.ClearRange || mutation.type == mutation.DebugKeyRange) && diff --git a/fdbserver/QuietDatabase.actor.cpp b/fdbserver/QuietDatabase.actor.cpp index 98f14d545e..40f731aed6 100644 --- a/fdbserver/QuietDatabase.actor.cpp +++ b/fdbserver/QuietDatabase.actor.cpp @@ -294,6 +294,11 @@ ACTOR Future getMaxStorageServerQueueSize(Database cx, Reference servers = wait(serversFuture); state std::vector workers = wait(workersFuture); + /*printf("Found %d storage servers:\n", servers.size()); + for (auto& it : servers) { + printf(" %s\n", it.id().toString().c_str()); + }*/ + std::map workersMap; for (auto worker : workers) { workersMap[worker.interf.address()] = worker.interf; @@ -323,6 +328,7 @@ ACTOR Future getMaxStorageServerQueueSize(Database cx, Reference getStorageServersRecruiting(Database cx, WorkerInterface dist 1.0)); 
TraceEvent("StorageServersRecruiting").detail("Message", recruitingMessage.toString()); - return recruitingMessage.getValue("State") == "Recruiting"; + + if (recruitingMessage.getValue("State") == "Recruiting") { + std::string tssValue; + // if we're tss recruiting, that's fine because that can block indefinitely if only 1 free storage process + if (!recruitingMessage.tryGetValue("IsTSS", tssValue) || tssValue == "False") { + return true; + } + } + return false; } catch (Error& e) { TraceEvent("QuietDatabaseFailure", distributorWorker.id()) .detail("Reason", "Failed to extract StorageServersRecruiting") diff --git a/fdbserver/Ratekeeper.actor.cpp b/fdbserver/Ratekeeper.actor.cpp index 71d3056489..83894c1201 100644 --- a/fdbserver/Ratekeeper.actor.cpp +++ b/fdbserver/Ratekeeper.actor.cpp @@ -719,9 +719,11 @@ ACTOR Future trackEachStorageServer( when(state std::pair> change = waitNext(serverChanges)) { wait(delay(0)); // prevent storageServerTracker from getting cancelled while on the call stack if (change.second.present()) { - auto& a = actors[change.first]; - a = Future(); - a = splitError(trackStorageServerQueueInfo(self, change.second.get()), err); + if (!change.second.get().isTss) { // TODO is this all we need to do to get ratekeeper to ignore tss? 
+ auto& a = actors[change.first]; + a = Future(); + a = splitError(trackStorageServerQueueInfo(self, change.second.get()), err); + } } else actors.erase(change.first); } diff --git a/fdbserver/SimulatedCluster.actor.cpp b/fdbserver/SimulatedCluster.actor.cpp index 128eace3a8..24d7dfb01d 100644 --- a/fdbserver/SimulatedCluster.actor.cpp +++ b/fdbserver/SimulatedCluster.actor.cpp @@ -1138,6 +1138,7 @@ void SimulationConfig::generateNormalConfig(const TestConfig& testConfig) { storage_engine_type = deterministicRandom()->randomInt(0, 4); } } + switch (storage_engine_type) { case 0: { TEST(true); // Simulated cluster using ssd storage engine @@ -1162,6 +1163,17 @@ void SimulationConfig::generateNormalConfig(const TestConfig& testConfig) { default: ASSERT(false); // Programmer forgot to adjust cases. } + + int tssCount = 0; + // if (!testConfig.simpleConfig && deterministicRandom()->random01() < 0.25) { + if (true) { + // if (false) { + // tss + // 1 or 2 tss + tssCount = deterministicRandom()->randomInt(1, 3); + printf("Initial tss count to %d\n", tssCount); + } + // if (deterministicRandom()->random01() < 0.5) { // set_config("ssd"); // } else { @@ -1494,6 +1506,29 @@ void SimulationConfig::generateNormalConfig(const TestConfig& testConfig) { } else { processes_per_machine = deterministicRandom()->randomInt(1, (extraDB ? 14 : 28) / machine_count + 2); } + + // reduce tss to half of extra non-seed servers that can be recruited in usable regions. 
+ tssCount = + std::max(0, std::min(tssCount, (db.usableRegions * (machine_count / datacenters) - replication_type) / 2)); + printf("Adjusted tss count to %d\n", tssCount); + + if (tssCount > 0) { + std::string confStr = format("tss_count:=%d tss_storage_engine:=%d", tssCount, db.storageServerStoreType); + set_config(confStr); + double tssRandom = deterministicRandom()->random01(); + if (tssRandom > 0.5) { + // normal tss mode + g_simulator.tssMode = ISimulator::TSSMode::EnabledNormal; + printf("normal tss mode\n"); + } else if (tssRandom < 0.25) { + // delay injection + g_simulator.tssMode = ISimulator::TSSMode::EnabledAddDelay; + } else { + // fault injection + g_simulator.tssMode = ISimulator::TSSMode::EnabledDropMutations; + } + printf("enabling tss for simulation in mode %d: %s\n", g_simulator.tssMode, confStr.c_str()); + } } // Configures the system according to the given specifications in order to run @@ -1517,6 +1552,9 @@ void setupSimulatedSystem(vector>* systemActors, startingConfigString += " locked"; } for (auto kv : startingConfigJSON) { + if ("tss_storage_engine" == kv.first) { + continue; + } startingConfigString += " "; if (kv.second.type() == json_spirit::int_type) { startingConfigString += kv.first + ":=" + format("%d", kv.second.get_int()); @@ -1531,6 +1569,12 @@ void setupSimulatedSystem(vector>* systemActors, } } + // handle tss_storage_engine separately because the passthrough needs the enum ordinal, but it's serialized to json + // as the string name + if (simconfig.db.desiredTSSCount > 0) { + startingConfigString += format(" tss_storage_engine:=%d", simconfig.db.testingStorageServerStoreType); + } + if (g_simulator.originalRegions != "") { simconfig.set_config(g_simulator.originalRegions); g_simulator.startingDisabledConfiguration = startingConfigString + " " + g_simulator.disableRemote; diff --git a/fdbserver/Status.actor.cpp b/fdbserver/Status.actor.cpp index 5f546638ff..723c6c6111 100644 --- a/fdbserver/Status.actor.cpp +++ 
b/fdbserver/Status.actor.cpp @@ -1880,10 +1880,10 @@ ACTOR static Future>> getCommit ACTOR static Future>> getGrvProxiesAndMetrics( Reference> db, std::unordered_map address_workers) { - vector> results = - wait(getServerMetrics(db->get().client.grvProxies, - address_workers, - std::vector{ "GRVLatencyMetrics", "GRVLatencyBands", "GRVBatchLatencyMetrics" })); + vector> results = wait( + getServerMetrics(db->get().client.grvProxies, + address_workers, + std::vector{ "GRVLatencyMetrics", "GRVLatencyBands", "GRVBatchLatencyMetrics" })); return results; } @@ -3005,6 +3005,14 @@ ACTOR Future clusterGetStatus( statusObj["incompatible_connections"] = incompatibleConnectionsArray; statusObj["datacenter_lag"] = getLagObject(datacenterVersionDifference); + int activeTSSCount = 0; + for (auto& it : storageServers) { + if (it.first.isTss) { + activeTSSCount++; + } + } + statusObj["active_tss_count"] = activeTSSCount; + int totalDegraded = 0; for (auto& it : workers) { if (it.degraded) { diff --git a/fdbserver/TLogServer.actor.cpp b/fdbserver/TLogServer.actor.cpp index 4ea9e83bee..f884a2e310 100644 --- a/fdbserver/TLogServer.actor.cpp +++ b/fdbserver/TLogServer.actor.cpp @@ -1671,6 +1671,11 @@ ACTOR Future tLogPeekMessages(TLogData* self, TLogPeekRequest req, Referen Version poppedVer = poppedVersion(logData, req.tag); if (poppedVer > req.begin) { + printf("tag %s - %s tried to peek popped data!!: %lld > %lld\n", + req.tag.toString().c_str(), + peekId.toString().c_str(), + poppedVer, + req.begin); TLogPeekReply rep; rep.maxKnownVersion = logData->version.get(); rep.minKnownCommittedVersion = logData->minKnownCommittedVersion; diff --git a/fdbserver/WorkerInterface.actor.h b/fdbserver/WorkerInterface.actor.h index 3446b3a7b8..48a4d9ce07 100644 --- a/fdbserver/WorkerInterface.actor.h +++ b/fdbserver/WorkerInterface.actor.h @@ -614,11 +614,18 @@ struct InitializeStorageRequest { UID reqId; UID interfaceId; KeyValueStoreType storeType; + bool isTss; + UID tssPairID; + Version 
tssPairVersion; ReplyPromise reply; template void serialize(Ar& ar) { - serializer(ar, seedTag, reqId, interfaceId, storeType, reply); + if (ar.protocolVersion().hasTSS()) { + serializer(ar, seedTag, reqId, interfaceId, storeType, reply, isTss, tssPairID, tssPairVersion); + } else { + serializer(ar, seedTag, reqId, interfaceId, storeType, reply); + } } }; @@ -770,6 +777,7 @@ struct DiskStoreRequest { struct Role { static const Role WORKER; static const Role STORAGE_SERVER; + static const Role TESTING_STORAGE_SERVER; static const Role TRANSACTION_LOG; static const Role SHARED_TRANSACTION_LOG; static const Role COMMIT_PROXY; @@ -840,6 +848,7 @@ class IDiskQueue; ACTOR Future storageServer(IKeyValueStore* persistentData, StorageServerInterface ssi, Tag seedTag, + Version tssSeedVersion, ReplyPromise recruitReply, Reference> db, std::string folder); diff --git a/fdbserver/masterserver.actor.cpp b/fdbserver/masterserver.actor.cpp index 97953ce1a3..c0dee60682 100644 --- a/fdbserver/masterserver.actor.cpp +++ b/fdbserver/masterserver.actor.cpp @@ -417,11 +417,14 @@ ACTOR Future newTLogServers(Reference self, ACTOR Future newSeedServers(Reference self, RecruitFromConfigurationReply recruits, vector* servers) { + printf("Seeding initial %d storage servers\n", recruits.storageServers.size()); // This is only necessary if the database is at version 0 servers->clear(); if (self->lastEpochEnd) return Void(); + // TODO might need to make this handle TSS recruitment (or make RecruitFromConfiguration handle it?) for simulation + state int idx = 0; state std::map, Tag> dcId_tags; state int8_t nextLocality = 0; @@ -434,6 +437,7 @@ ACTOR Future newSeedServers(Reference self, ? 
dcId_tags[recruits.storageServers[idx].locality.dcId()] : Tag(nextLocality, 0); isr.storeType = self->configuration.storageServerStoreType; + isr.isTss = false; isr.reqId = deterministicRandom()->randomUniqueID(); isr.interfaceId = deterministicRandom()->randomUniqueID(); @@ -469,6 +473,8 @@ ACTOR Future newSeedServers(Reference self, .detail("TargetCount", self->configuration.storageTeamSize) .detail("Servers", describe(*servers)); + printf("Seed servers sees %d desired tss\n", self->configuration.desiredTSSCount); + return Void(); } diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp index 6789549944..7fe0b1c2a3 100644 --- a/fdbserver/storageserver.actor.cpp +++ b/fdbserver/storageserver.actor.cpp @@ -38,6 +38,7 @@ #include "fdbclient/DatabaseContext.h" #include "fdbclient/KeyRangeMap.h" #include "fdbclient/CommitProxyInterface.h" +#include "fdbclient/KeyBackedTypes.h" #include "fdbclient/NativeAPI.actor.h" #include "fdbclient/Notified.h" #include "fdbclient/StatusClient.h" @@ -463,7 +464,7 @@ public: void byteSampleApplyClear(KeyRangeRef range, Version ver); void popVersion(Version v, bool popAllTags = false) { - if (logSystem) { + if (logSystem && !isTss()) { if (v > poppedAllAfter) { popAllTags = true; poppedAllAfter = std::numeric_limits::max(); @@ -510,6 +511,25 @@ public: return mLV.push_back_deep(mLV.arena(), m); } + void setTssPair(UID pairId) { + tssPairID = Optional(pairId); + + // Set up tss fault injection here, only if we are in simulated mode and with fault injection. + // With fault injection enabled, the tss will start acting normal for a bit, then after the specified delay + // start behaving incorrectly. 
+ if (g_network->isSimulated() && !g_simulator.speedUpSimulation && + g_simulator.tssMode >= ISimulator::TSSMode::EnabledAddDelay) { + tssFaultInjectTime = now() + deterministicRandom()->randomInt(60, 300); + TraceEvent(SevWarnAlways, "TSSInjectFaultEnabled", thisServerID) + .detail("Mode", g_simulator.tssMode) + .detail("At", tssFaultInjectTime.get()); + printf("ENABLING FAULT INJECTION FOR TSS %s at time %.4f in mode %d\n", + thisServerID.toString().c_str(), + tssFaultInjectTime.get(), + g_simulator.tssMode); + } + } + StorageServerDisk storage; KeyRangeMap> shards; @@ -552,6 +572,9 @@ public: Reference logCursor; UID thisServerID; + Optional tssPairID; // if this server is a tss, this is the id of its (ss) pair + Optional ssPairID; // if this server is an ss, this is the id of its (tss) pair + Optional tssFaultInjectTime; Key sk; Reference> db; Database cx; @@ -785,6 +808,14 @@ public: mutableData().forgetVersionsBefore(ver); } + bool isTss() const { return tssPairID.present(); } + + bool isSSWithTSSPair() const { return ssPairID.present(); } + + void setSSWithTssPair(UID idOfTSS) { ssPairID = Optional(idOfTSS); } + + void clearSSWithTssPair() { ssPairID = Optional(); } + // This is the maximum version that might be read from storage (the minimum version is durableVersion) Version storageVersion() const { return oldestVersion.get(); } @@ -1046,12 +1077,24 @@ void updateProcessStats(StorageServer* self) { ACTOR Future waitForVersionActor(StorageServer* data, Version version, SpanID spanContext) { state Span span("SS.WaitForVersion"_loc, { spanContext }); + /*if (172218491 == version) { + printf("%sSS %s starting waitForVersionActor @ %lld\n", data->tssPairID.present() ? "T" : "", + data->thisServerID.toString().c_str(), version); + }*/ choose { when(wait(data->version.whenAtLeast(version))) { // FIXME: A bunch of these can block with or without the following delay 0. 
// wait( delay(0) ); // don't do a whole bunch of these at once + /*if (172218491 == version) { + printf("%sSS %s waitForVersionActor @ %lld - at least version\n", data->tssPairID.present() ? "T" : "", + data->thisServerID.toString().c_str(), version); + }*/ if (version < data->oldestVersion.get()) throw transaction_too_old(); // just in case + /*if (172218491 == version) { + printf("%sSS %s waitForVersionActor @ %lld - not too old\n", data->tssPairID.present() ? "T" : "", + data->thisServerID.toString().c_str(), version); + }*/ return version; } when(wait(delay(SERVER_KNOBS->FUTURE_VERSION_DELAY))) { @@ -1060,23 +1103,39 @@ ACTOR Future waitForVersionActor(StorageServer* data, Version version, .detail("Version", version) .detail("MyVersion", data->version.get()) .detail("ServerID", data->thisServerID); + /*if (172218491 == version) { + printf("%sSS %s waitForVersionActor @ %lld - future version\n", data->tssPairID.present() ? "T" : "", + data->thisServerID.toString().c_str(), version); + }*/ throw future_version(); } } } Future waitForVersion(StorageServer* data, Version version, SpanID spanContext) { + /*if (172218491 == version) { + printf("%sSS %s started waitForVersion @ %lld\n", data->tssPairID.present() ? "T" : "", + data->thisServerID.toString().c_str(), version); + }*/ if (version == latestVersion) { version = std::max(Version(1), data->version.get()); } if (version < data->oldestVersion.get() || version <= 0) { + /*if (172218491 == version) { + printf("%sSS %s waitForVersion @ %lld - transaction too old\n", data->tssPairID.present() ? "T" : "", + data->thisServerID.toString().c_str(), version); + }*/ return transaction_too_old(); } else if (version <= data->version.get()) { return version; } if ((data->behind || data->versionBehind) && version > data->version.get()) { + /*if (172218491 == version) { + printf("%sSS %s waitForVersion @ %lld - process_behind\n", data->tssPairID.present() ? 
"T" : "", + data->thisServerID.toString().c_str(), version); + }*/ return process_behind(); } @@ -1110,6 +1169,11 @@ ACTOR Future getValueQ(StorageServer* data, GetValueRequest req) { Span span("SS:getValue"_loc, { req.spanContext }); span.addTag("key"_sr, req.key); + /*if ("000002ff000004c10000000100000002" == req.key.toString() && 172218491 == req.version) { + printf("%sSS %s started getValueQ for %s @ %lld\n", data->tssPairID.present() ? "T" : "", + data->thisServerID.toString().c_str(), req.key.toString().c_str(), req.version); + }*/ + try { ++data->counters.getValueQueries; ++data->counters.allQueries; @@ -1121,6 +1185,11 @@ ACTOR Future getValueQ(StorageServer* data, GetValueRequest req) { // so we need to downgrade here wait(data->getQueryDelay()); + /*if ("000002ff000004c10000000100000002" == req.key.toString() && 172218491 == req.version) { + printf("%sSS %s getValueQ for %s @ %lld - got query delay\n", data->tssPairID.present() ? "T" : "", + data->thisServerID.toString().c_str(), req.key.toString().c_str(), req.version); + }*/ + if (req.debugID.present()) g_traceBatch.addEvent("GetValueDebug", req.debugID.get().first(), @@ -1135,8 +1204,17 @@ ACTOR Future getValueQ(StorageServer* data, GetValueRequest req) { state uint64_t changeCounter = data->shardChangeCounter; + /*if ("000002ff000004c10000000100000002" == req.key.toString() && 172218491 == req.version) { + printf("%sSS %s getValueQ for %s @ %lld - waited for version\n", data->tssPairID.present() ? "T" : "", + data->thisServerID.toString().c_str(), req.key.toString().c_str(), req.version); + }*/ + if (!data->shards[req.key]->isReadable()) { //TraceEvent("WrongShardServer", data->thisServerID).detail("Key", req.key).detail("Version", version).detail("In", "getValueQ"); + /*if ("000002ff000004c10000000100000002" == req.key.toString() && 172218491 == req.version) { + printf("%sSS %s started getValueQ for %s @ %lld got wrong shard server\n", data->tssPairID.present() ? 
+ "T" : "", data->thisServerID.toString().c_str(), req.key.toString().c_str(), req.version); + }*/ throw wrong_shard_server(); } @@ -1145,6 +1223,10 @@ ACTOR Future getValueQ(StorageServer* data, GetValueRequest req) { if (i && i->isValue() && i.key() == req.key) { v = (Value)i->getValue(); path = 1; + /*if ("000002ff000004c10000000100000002" == req.key.toString() && 172218491 == req.version) { + printf("%sSS %s getValueQ for %s @ %lld - got from memory\n", data->tssPairID.present() ? "T" : "", + data->thisServerID.toString().c_str(), req.key.toString().c_str(), req.version); + }*/ } else if (!i || !i->isClearTo() || i->getEndKey() <= req.key) { path = 2; Optional vv = wait(data->storage.readValue(req.key, req.debugID)); @@ -1155,18 +1237,21 @@ ACTOR Future getValueQ(StorageServer* data, GetValueRequest req) { } data->checkChangeCounter(changeCounter, req.key); v = vv; + /*if ("000002ff000004c10000000100000002" == req.key.toString() && 172218491 == req.version) { + printf("%sSS %s getValueQ for %s @ %lld - got from storage\n", data->tssPairID.present() ? "T" : "", + data->thisServerID.toString().c_str(), req.key.toString().c_str(), req.version); + }*/ } DEBUG_MUTATION("ShardGetValue", version, MutationRef(MutationRef::DebugKey, req.key, v.present() ? v.get() : LiteralStringRef(""))); - DEBUG_MUTATION("ShardGetPath", - version, - MutationRef(MutationRef::DebugKey, - req.key, - path == 0 ? LiteralStringRef("0") - : path == 1 ? LiteralStringRef("1") - : LiteralStringRef("2"))); + DEBUG_MUTATION( + "ShardGetPath", + version, + MutationRef(MutationRef::DebugKey, + req.key, + path == 0 ? LiteralStringRef("0") : path == 1 ? 
LiteralStringRef("1") : LiteralStringRef("2"))); /* StorageMetrics m; @@ -1183,6 +1268,12 @@ ACTOR Future getValueQ(StorageServer* data, GetValueRequest req) { ++data->counters.emptyQueries; } + /*if ("000002ff000004c10000000100000002" == req.key.toString() && 172218491 == req.version) { + printf("%sSS %s getValueQ for %s @ %lld = %s\n", data->tssPairID.present() ? "T" : "", + data->thisServerID.toString().c_str(), req.key.toString().c_str(), req.version, v.present() ? + v.get().toString().c_str() : ""); + }*/ + if (SERVER_KNOBS->READ_SAMPLING_ENABLED) { // If the read yields no value, randomly sample the empty read. int64_t bytesReadPerKSecond = @@ -1205,8 +1296,16 @@ ACTOR Future getValueQ(StorageServer* data, GetValueRequest req) { reply.penalty = data->getPenalty(); req.reply.send(reply); } catch (Error& e) { + /*if ("000002ff000004c10000000100000002" == req.key.toString() && 172218491 == req.version) { + printf("%sSS %s getValueQ for %s @ %lld = ERROR: %d\n", data->tssPairID.present() ? "T" : "", + data->thisServerID.toString().c_str(), req.key.toString().c_str(), req.version, e.code()); + }*/ if (!canReplyWith(e)) throw; + /*if ("000002ff000004c10000000100000002" == req.key.toString() && 172218491 == req.version) { + printf("%sSS %s getValueQ for %s @ %lld = replying with error: %d\n", data->tssPairID.present() ? "T" : "", + data->thisServerID.toString().c_str(), req.key.toString().c_str(), req.version, e.code()); + }*/ data->sendErrorWithPenalty(req.reply, e, data->getPenalty()); } @@ -1717,13 +1816,21 @@ ACTOR Future findKey(StorageServer* data, state int distance = forward ? sel.offset : 1 - sel.offset; state Span span("SS.findKey"_loc, { parentSpan }); + /*if (version == 166817893 && sel.offset == 80) { + printf("%sSS %s FindKey request %s:<%s:%d @ %lld: with key range [%s - %s):\n", data->isTss() ? "t" : "", + data->thisServerID.toString().c_str(), sel.getKey().printable().c_str(), sel.orEqual ? 
"=" : "", sel.offset, + version, range.begin.toString().c_str(), range.end.toString().c_str()); + }*/ + // Don't limit the number of bytes if this is a trivial key selector (there will be at most two items returned from // the read range in this case) state int maxBytes; if (sel.offset <= 1 && sel.offset >= 0) maxBytes = std::numeric_limits::max(); else - maxBytes = BUGGIFY ? SERVER_KNOBS->BUGGIFY_LIMIT_BYTES : SERVER_KNOBS->STORAGE_LIMIT_BYTES; + maxBytes = (g_network->isSimulated() && g_simulator.tssMode == ISimulator::TSSMode::Disabled && BUGGIFY) + ? SERVER_KNOBS->BUGGIFY_LIMIT_BYTES + : SERVER_KNOBS->STORAGE_LIMIT_BYTES; state GetKeyValuesReply rep = wait( readRange(data, @@ -1734,6 +1841,13 @@ ACTOR Future findKey(StorageServer* data, span.context)); state bool more = rep.more && rep.data.size() != distance + skipEqualKey; + /*if (version == 166817893 && sel.offset == 80) { + printf("%sSS %s FindKey request %s:<%s:%d @ %lld: readRange with limBytes=%d got %d:\n", data->isTss() ? "t" : + "", data->thisServerID.toString().c_str(), sel.getKey().printable().c_str(), sel.orEqual ? "=" : "", sel.offset, + version, maxBytes, rep.data.size()); for (auto& it : rep.data) { printf(" %s\n", it.key.toString().c_str()); + } + }*/ + // If we get only one result in the reverse direction as a result of the data being too large, we could get stuck in // a loop if (more && !forward && rep.data.size() == 1) { @@ -1781,9 +1895,20 @@ ACTOR Future findKey(StorageServer* data, // query SOMEDAY: graceful handling of exceptionally sized values ASSERT(returnKey != sel.getKey()); + /*if (version == 166817893 && sel.offset == 80) { + printf("%sSS %s FindKey request %s:<%s:%d @ %lld: moving same shard\n", data->isTss() ? "t" : "", + data->thisServerID.toString().c_str(), sel.getKey().printable().c_str(), sel.orEqual ? 
"=" : "", sel.offset, + version); + }*/ return returnKey; - } else + } else { + /*if (version == 166817893 && sel.offset == 80) { + printf("%sSS %s FindKey request %s:<%s:%d @ %lld: moving shard boundary\n", data->isTss() ? "t" : "", + data->thisServerID.toString().c_str(), sel.getKey().printable().c_str(), sel.orEqual ? "=" : "", sel.offset, + version); + }*/ return forward ? range.end : range.begin; + } } } @@ -1806,6 +1931,15 @@ ACTOR Future getKeyValuesQ(StorageServer* data, GetKeyValuesRequest req) state Span span("SS:getKeyValues"_loc, { req.spanContext }); state int64_t resultSize = 0; + if (req.begin.getKey().toString() == "B" && req.end.getKey().toString() == "v" && req.version == 107157353) { + printf("%sSS %s starting query [%s - %s) @ %lld\n", + data->isTss() ? "T" : "", + data->thisServerID.toString().c_str(), + req.begin.getKey().printable().c_str(), + req.end.getKey().printable().c_str(), + req.version); + } + ++data->counters.getRangeQueries; ++data->counters.allQueries; ++data->readQueueSizeMetric; @@ -1820,6 +1954,15 @@ ACTOR Future getKeyValuesQ(StorageServer* data, GetKeyValuesRequest req) wait(data->getQueryDelay()); } + if (req.begin.getKey().toString() == "B" && req.end.getKey().toString() == "v" && req.version == 107157353) { + printf("%sSS %s downgraded [%s - %s) @ %lld\n", + data->isTss() ? "T" : "", + data->thisServerID.toString().c_str(), + req.begin.getKey().printable().c_str(), + req.end.getKey().printable().c_str(), + req.version); + } + try { if (req.debugID.present()) g_traceBatch.addEvent("TransactionDebug", req.debugID.get().first(), "storageserver.getKeyValues.Before"); @@ -1844,6 +1987,15 @@ ACTOR Future getKeyValuesQ(StorageServer* data, GetKeyValuesRequest req) throw wrong_shard_server(); } + if (req.begin.getKey().toString() == "B" && req.end.getKey().toString() == "v" && req.version == 107157353) { + printf("%sSS %s validated shard [%s - %s) @ %lld\n", + data->isTss() ? 
"T" : "", + data->thisServerID.toString().c_str(), + req.begin.getKey().printable().c_str(), + req.end.getKey().printable().c_str(), + req.version); + } + state int offset1; state int offset2; state Future fBegin = req.begin.isFirstGreaterOrEqual() @@ -1854,6 +2006,7 @@ ACTOR Future getKeyValuesQ(StorageServer* data, GetKeyValuesRequest req) : findKey(data, req.end, version, shard, &offset2, span.context); state Key begin = wait(fBegin); state Key end = wait(fEnd); + if (req.debugID.present()) g_traceBatch.addEvent( "TransactionDebug", req.debugID.get().first(), "storageserver.getKeyValues.AfterKeys"); @@ -1873,6 +2026,25 @@ ACTOR Future getKeyValuesQ(StorageServer* data, GetKeyValuesRequest req) throw wrong_shard_server(); } + if (req.begin.getKey().toString() == "B" && req.end.getKey().toString() == "v" && req.version == 107157353) { + printf("%sSS %s resolved begin and end [%s - %s) @ %lld\n", + data->isTss() ? "T" : "", + data->thisServerID.toString().c_str(), + req.begin.getKey().printable().c_str(), + req.end.getKey().printable().c_str(), + req.version); + printf(" %s:<%s:%d @ -> %s\n", + req.begin.getKey().printable().c_str(), + req.begin.orEqual ? "=" : "", + req.begin.offset, + req.begin.getKey().printable().c_str()); + printf(" %s:<%s:%d @ -> %s\n", + req.end.getKey().printable().c_str(), + req.end.orEqual ? "=" : "", + req.end.offset, + req.end.getKey().printable().c_str()); + } + if (begin >= end) { if (req.debugID.present()) g_traceBatch.addEvent("TransactionDebug", req.debugID.get().first(), "storageserver.getKeyValues.Send"); @@ -1890,10 +2062,28 @@ ACTOR Future getKeyValuesQ(StorageServer* data, GetKeyValuesRequest req) } else { state int remainingLimitBytes = req.limitBytes; + /*if (req.begin.getKey().toString() == "m3fc7" && req.end.getKey().toString() == "s" && req.version == + 133421369) { printf("%sSS %s beginning readRange [%s - %s) @ %lld\n", data->isTss() ? 
"T" : "", + data->thisServerID.toString().c_str(), req.begin.getKey().printable().c_str(), + req.end.getKey().printable().c_str(), req.version); + }*/ + GetKeyValuesReply _r = wait(readRange(data, version, KeyRangeRef(begin, end), req.limit, &remainingLimitBytes, span.context)); GetKeyValuesReply r = _r; + if (req.begin.getKey().toString() == "B" && req.end.getKey().toString() == "v" && + req.version == 107157353) { + printf("%sSS %s completed readRange (%d)%s: \n", + data->isTss() ? "T" : "", + data->thisServerID.toString().c_str(), + r.data.size(), + r.more ? "+" : ""); + /*for (auto& it : r.data) { + printf(" %s=%s\n", it.key.printable().c_str(), it.value.printable().c_str()); + }*/ + } + if (req.debugID.present()) g_traceBatch.addEvent( "TransactionDebug", req.debugID.get().first(), "storageserver.getKeyValues.AfterReadRange"); @@ -1926,6 +2116,14 @@ ACTOR Future getKeyValuesQ(StorageServer* data, GetKeyValuesRequest req) data->metrics.notifyBytesReadPerKSecond(r.data[r.data.size() - 1].key, bytesReadPerKSecond); } + if (req.begin.getKey().toString() == "B" && req.end.getKey().toString() == "v" && + req.version == 107157353) { + printf("%sSS %s replying to %s\n", + data->isTss() ? "T" : "", + data->thisServerID.toString().c_str(), + req.reply.getEndpoint().token.toString().c_str()); + } + r.penalty = data->getPenalty(); req.reply.send(r); @@ -1976,14 +2174,33 @@ ACTOR Future getKeyQ(StorageServer* data, GetKeyRequest req) { // so we need to downgrade here wait(data->getQueryDelay()); + /*if (req.version == 166817893 && req.sel.offset == 80) { + printf("%sSS %s GetKey request %s:<%s:%d @ %lld\n", data->isTss() ? "t" : "", + data->thisServerID.toString().c_str(), req.sel.getKey().printable().c_str(), req.sel.orEqual ? 
"=" : "", + req.sel.offset, req.version); + }*/ + try { state Version version = wait(waitForVersion(data, req.version, req.spanContext)); + + /*if (req.version == 166817893 && req.sel.offset == 80) { + printf("%sSS %s GetKey request %s:<%s:%d @ %lld: waited for version\n", data->isTss() ? "t" : "", + data->thisServerID.toString().c_str(), req.sel.getKey().printable().c_str(), req.sel.orEqual ? "=" : "", + req.sel.offset, req.version); + }*/ + state uint64_t changeCounter = data->shardChangeCounter; state KeyRange shard = getShardKeyRange(data, req.sel); state int offset; Key k = wait(findKey(data, req.sel, version, shard, &offset, req.spanContext)); + /*if (req.version == 166817893 && req.sel.offset == 80) { + printf("%sSS %s GetKey request %s:<%s:%d @ %lld: found key: %s\n", data->isTss() ? "t" : "", + data->thisServerID.toString().c_str(), req.sel.getKey().printable().c_str(), req.sel.orEqual ? "=" : "", + req.sel.offset, req.version, k.toString().c_str()); + }*/ + data->checkChangeCounter( changeCounter, KeyRangeRef(std::min(req.sel.getKey(), k), std::max(req.sel.getKey(), k))); @@ -1998,6 +2215,12 @@ ACTOR Future getKeyQ(StorageServer* data, GetKeyRequest req) { else updated = KeySelectorRef(k, true, 0); // found + /*if (req.version == 166817893 && req.sel.offset == 80) { + printf("%sSS %s GetKey request %s:<%s:%d @ %lld: updated: %s:<%s:%d\n", data->isTss() ? "t" : "", + data->thisServerID.toString().c_str(), req.sel.getKey().printable().c_str(), req.sel.orEqual ? "=" : "", + req.sel.offset, req.version, updated.getKey().printable().c_str(), updated.orEqual ? 
"=" : "", updated.offset); + }*/ + resultSize = k.size(); data->counters.bytesQueried += resultSize; ++data->counters.rowsQueried; @@ -2322,6 +2545,14 @@ void removeDataRange(StorageServer* ss, // disk when this latest version becomes durable mLV is also modified if necessary to ensure that split clears can // be forgotten + // TODO REMOVE print + printf("%sss %s removing data range [%s - %s) @ %lld\n", + ss->isTss() ? "t" : "", + ss->thisServerID.toString().c_str(), + range.begin.toString().c_str(), + range.end.toString().c_str(), + mLV.version); + MutationRef clearRange(MutationRef::ClearRange, range.begin, range.end); clearRange = ss->addMutationToMutationLog(mLV, clearRange); @@ -2352,6 +2583,13 @@ void removeDataRange(StorageServer* ss, } data.erase(range.begin, range.end); + + printf("%sss %s removed data range [%s - %s) @ %lld\n", + ss->isTss() ? "t" : "", + ss->thisServerID.toString().c_str(), + range.begin.toString().c_str(), + range.end.toString().c_str(), + mLV.version); } void setAvailableStatus(StorageServer* self, KeyRangeRef keys, bool available); @@ -2932,32 +3170,30 @@ void changeServerKeys(StorageServer* data, ChangeServerKeysContext context) { ASSERT(!keys.empty()); - //TraceEvent("ChangeServerKeys", data->thisServerID) - // .detail("KeyBegin", keys.begin) - // .detail("KeyEnd", keys.end) - // .detail("NowAssigned", nowAssigned) - // .detail("Version", version) - // .detail("Context", changeServerKeysContextName[(int)context]); + TraceEvent("ChangeServerKeys", data->thisServerID) + .detail("KeyBegin", keys.begin) + .detail("KeyEnd", keys.end) + .detail("NowAssigned", nowAssigned) + .detail("Version", version) + .detail("Context", changeServerKeysContextName[(int)context]); validate(data); // TODO(alexmiller): Figure out how to selectively enable spammy data distribution events. - // DEBUG_KEY_RANGE( nowAssigned ? "KeysAssigned" : "KeysUnassigned", version, keys ); + DEBUG_KEY_RANGE(nowAssigned ? 
"KeysAssigned" : "KeysUnassigned", version, keys); bool isDifferent = false; auto existingShards = data->shards.intersectingRanges(keys); for (auto it = existingShards.begin(); it != existingShards.end(); ++it) { if (nowAssigned != it->value()->assigned()) { isDifferent = true; - /*TraceEvent("CSKRangeDifferent", data->thisServerID) - .detail("KeyBegin", it->range().begin) - .detail("KeyEnd", it->range().end);*/ + TraceEvent("CSKRangeDifferent", data->thisServerID) + .detail("KeyBegin", it->range().begin) + .detail("KeyEnd", it->range().end); break; } } if (!isDifferent) { - //TraceEvent("CSKShortCircuit", data->thisServerID) - // .detail("KeyBegin", keys.begin) - // .detail("KeyEnd", keys.end); + TraceEvent("CSKShortCircuit", data->thisServerID).detail("KeyBegin", keys.begin).detail("KeyEnd", keys.end); return; } @@ -2995,13 +3231,13 @@ void changeServerKeys(StorageServer* data, for (auto r = vr.begin(); r != vr.end(); ++r) { KeyRangeRef range = keys & r->range(); bool dataAvailable = r->value() == latestVersion || r->value() >= version; - /*TraceEvent("CSKRange", data->thisServerID) + TraceEvent("CSKRange", data->thisServerID) .detail("KeyBegin", range.begin) .detail("KeyEnd", range.end) .detail("Available", dataAvailable) .detail("NowAssigned", nowAssigned) .detail("NewestAvailable", r->value()) - .detail("ShardState0", data->shards[range.begin]->debugDescribeState());*/ + .detail("ShardState0", data->shards[range.begin]->debugDescribeState()); if (!nowAssigned) { if (dataAvailable) { ASSERT(r->value() == @@ -3043,8 +3279,14 @@ void changeServerKeys(StorageServer* data, oldShards.clear(); ranges.clear(); for (auto r = removeRanges.begin(); r != removeRanges.end(); ++r) { + // TODO should we do this at the passed in version? (or the passed in version + 1?) 
removeDataRange(data, data->addVersionToMutationLog(data->data().getLatestVersion()), data->shards, *r); setAvailableStatus(data, *r, false); + printf("%sss %s set data range unavailable [%s - %s)\n", + data->isTss() ? "t" : "", + data->thisServerID.toString().c_str(), + keys.begin.toString().c_str(), + keys.end.toString().c_str()); } validate(data); } @@ -3103,6 +3345,7 @@ static const KeyValueRef persistFormat(LiteralStringRef(PERSIST_PREFIX "Format") static const KeyRangeRef persistFormatReadableRange(LiteralStringRef("FoundationDB/StorageServer/1/2"), LiteralStringRef("FoundationDB/StorageServer/1/5")); static const KeyRef persistID = LiteralStringRef(PERSIST_PREFIX "ID"); +static const KeyRef persistTssPairID = LiteralStringRef(PERSIST_PREFIX "tssPairID"); // (Potentially) change with the durable version or when fetchKeys completes static const KeyRef persistVersion = LiteralStringRef(PERSIST_PREFIX "Version"); @@ -3215,15 +3458,26 @@ private: data->recoveryVersionSkips.emplace_back(rollbackVersion, currentVersion - rollbackVersion); } else if (m.type == MutationRef::SetValue && m.param1 == killStoragePrivateKey) { + printf("worked removed kill storage: %s\n", data->thisServerID.toString().c_str()); throw worker_removed(); } else if ((m.type == MutationRef::SetValue || m.type == MutationRef::ClearRange) && m.param1.substr(1).startsWith(serverTagPrefix)) { - bool matchesThisServer = decodeServerTagKey(m.param1.substr(1)) == data->thisServerID; - if ((m.type == MutationRef::SetValue && !matchesThisServer) || - (m.type == MutationRef::ClearRange && matchesThisServer)) + UID serverTagKey = decodeServerTagKey(m.param1.substr(1)); + // bool matchesThisServer = (!data->isTss() && serverTagKey == data->thisServerID) || (data->isTss() && + // serverTagKey == data->tssPairID.get()); + bool matchesThisServer = serverTagKey == data->thisServerID; + bool matchesTssPair = data->isTss() ? 
serverTagKey == data->tssPairID.get() : false; + if ((m.type == MutationRef::SetValue && !data->isTss() && !matchesThisServer) || + (m.type == MutationRef::ClearRange && (matchesThisServer || (data->isTss() && matchesTssPair)))) { + printf("%sSS %s removed b/c tag mutation: %s\n", + data->isTss() ? "T" : "", + data->thisServerID.toString().c_str(), + m.toString().c_str()); throw worker_removed(); + } } else if (m.type == MutationRef::SetValue && m.param1 == rebootWhenDurablePrivateKey) { data->rebootAfterDurableVersion = currentVersion; + printf("%s got reboot after durable @ %lld\n", data->thisServerID.toString().c_str(), currentVersion); TraceEvent("RebootWhenDurableSet", data->thisServerID) .detail("DurableVersion", data->durableVersion.get()) .detail("RebootAfterDurableVersion", data->rebootAfterDurableVersion); @@ -3288,6 +3542,24 @@ ACTOR Future update(StorageServer* data, bool* pReceivedUpdate) { wait(delayJittered(.005, TaskPriority::TLogPeekReply)); } + // TODO REMOVE!! just for testing what happens when TSS gets behind + if (g_network->isSimulated() && data->isTss() && g_simulator.tssMode == ISimulator::TSSMode::EnabledAddDelay && + data->tssFaultInjectTime.present() && data->tssFaultInjectTime.get() < now()) { + if (deterministicRandom()->random01() < 0.01) { + TraceEvent(SevWarnAlways, "TSSInjectDelayForever", data->thisServerID); + printf("TSS %s INJECTING DELAY FOREVER!!\n", data->thisServerID.toString().c_str()); + // small random chance to just completely get stuck here, each tss should eventually hit this in this + // mode + wait(Never()); + } else { + // otherwise pause for part of a second + double delayTime = deterministicRandom()->random01(); + TraceEvent(SevWarnAlways, "TSSInjectDelay", data->thisServerID).detail("Delay", delayTime); + printf("TSS %s INJECTING DELAY for %.4f!!\n", data->thisServerID.toString().c_str(), delayTime); + wait(delay(delayTime)); + } + } + while (data->byteSampleClearsTooLarge.get()) { 
wait(data->byteSampleClearsTooLarge.onChange()); } @@ -3300,8 +3572,11 @@ ACTOR Future update(StorageServer* data, bool* pReceivedUpdate) { break; } } - if (cursor->popped() > 0) + if (cursor->popped() > 0) { + printf( + "Worker removed because of popped=%d: %s\n", cursor->popped(), data->thisServerID.toString().c_str()); throw worker_removed(); + } ++data->counters.updateBatches; data->lastTLogVersion = cursor->getMaxKnownVersion(); @@ -3352,7 +3627,7 @@ ACTOR Future update(StorageServer* data, bool* pReceivedUpdate) { } else { MutationRef msg; cloneReader >> msg; - //TraceEvent(SevDebug, "SSReadingLog", data->thisServerID).detail("Mutation", msg.toString()); + // TraceEvent(SevDebug, "SSReadingLog", data->thisServerID).detail("Mutation", msg.toString()); if (firstMutation && msg.param1.startsWith(systemKeys.end)) hasPrivateData = true; @@ -3460,7 +3735,15 @@ ACTOR Future update(StorageServer* data, bool* pReceivedUpdate) { Span span("SS:update"_loc, { spanContext }); span.addTag("key"_sr, msg.param1); - if (ver != invalidVersion) { // This change belongs to a version < minVersion + if (g_network->isSimulated() && data->isTss() && + g_simulator.tssMode == ISimulator::TSSMode::EnabledDropMutations && + data->tssFaultInjectTime.present() && data->tssFaultInjectTime.get() < now() && + (msg.type == MutationRef::SetValue || msg.type == MutationRef::ClearRange) && msg.param1.size() && + msg.param1[0] != 0xff && deterministicRandom()->random01() < 0.05) { + TraceEvent(SevWarnAlways, "TSSInjectDropMutation", data->thisServerID) + .detail("Mutation", msg.toString()) + .detail("Version", cloneCursor2->version().toString()); + } else if (ver != invalidVersion) { // This change belongs to a version < minVersion DEBUG_MUTATION("SSPeek", ver, msg).detail("ServerID", data->thisServerID); if (ver == 1) { TraceEvent("SSPeekMutation", data->thisServerID); @@ -3699,8 +3982,14 @@ ACTOR Future updateStorage(StorageServer* data) { #endif void 
StorageServerDisk::makeNewStorageServerDurable() { + // TODO REMOVE print + printf( + "%sSS %s saving durable state\n", data->tssPairID.present() ? "T" : "", data->thisServerID.toString().c_str()); storage->set(persistFormat); storage->set(KeyValueRef(persistID, BinaryWriter::toValue(data->thisServerID, Unversioned()))); + if (data->tssPairID.present()) { + storage->set(KeyValueRef(persistTssPairID, BinaryWriter::toValue(data->tssPairID.get(), Unversioned()))); + } storage->set(KeyValueRef(persistVersion, BinaryWriter::toValue(data->version.get(), Unversioned()))); storage->set(KeyValueRef(persistShardAssignedKeys.begin.toString(), LiteralStringRef("0"))); storage->set(KeyValueRef(persistShardAvailableKeys.begin.toString(), LiteralStringRef("0"))); @@ -3929,6 +4218,7 @@ ACTOR Future restoreByteSample(StorageServer* data, ACTOR Future restoreDurableState(StorageServer* data, IKeyValueStore* storage) { state Future> fFormat = storage->readValue(persistFormat.key); state Future> fID = storage->readValue(persistID); + state Future> ftssPairID = storage->readValue(persistTssPairID); state Future> fVersion = storage->readValue(persistVersion); state Future> fLogProtocol = storage->readValue(persistLogProtocol); state Future> fPrimaryLocality = storage->readValue(persistPrimaryLocality); @@ -3941,7 +4231,7 @@ ACTOR Future restoreDurableState(StorageServer* data, IKeyValueStore* stor restoreByteSample(data, storage, byteSampleSampleRecovered, startByteSampleRestore.getFuture()); TraceEvent("ReadingDurableState", data->thisServerID); - wait(waitForAll(std::vector{ fFormat, fID, fVersion, fLogProtocol, fPrimaryLocality })); + wait(waitForAll(std::vector{ fFormat, fID, ftssPairID, fVersion, fLogProtocol, fPrimaryLocality })); wait(waitForAll(std::vector{ fShardAssigned, fShardAvailable })); wait(byteSampleSampleRecovered.getFuture()); TraceEvent("RestoringDurableState", data->thisServerID); @@ -3961,7 +4251,12 @@ ACTOR Future restoreDurableState(StorageServer* data, 
IKeyValueStore* stor throw worker_recovery_failed(); } data->thisServerID = BinaryReader::fromStringRef(fID.get().get(), Unversioned()); - data->sk = serverKeysPrefixFor(data->thisServerID).withPrefix(systemKeys.begin); // FFFF/serverKeys/[this server]/ + if (ftssPairID.get().present()) { + data->setTssPair(BinaryReader::fromStringRef(ftssPairID.get().get(), Unversioned())); + } + + data->sk = serverKeysPrefixFor((data->tssPairID.present()) ? data->tssPairID.get() : data->thisServerID) + .withPrefix(systemKeys.begin); // FFFF/serverKeys/[this server]/ if (fLogProtocol.get().present()) data->logProtocol = BinaryReader::fromStringRef(fLogProtocol.get().get(), Unversioned()); @@ -3973,6 +4268,17 @@ ACTOR Future restoreDurableState(StorageServer* data, IKeyValueStore* stor debug_checkRestoredVersion(data->thisServerID, version, "StorageServer"); data->setInitialVersion(version); + // TODO REMOVE print + printf("%sSS %s restored durable state @ %lld\n", + data->tssPairID.present() ? "T" : "", + data->thisServerID.toString().c_str(), + version); + if (data->tssPairID.present()) { + printf("TSS %s recovered pairing to SS %s\n", + data->thisServerID.toString().c_str(), + data->tssPairID.get().toString().c_str()); + } + state RangeResult available = fShardAvailable.get(); state int availableLoc; for (availableLoc = 0; availableLoc < available.size(); availableLoc++) { @@ -4006,6 +4312,7 @@ ACTOR Future restoreDurableState(StorageServer* data, IKeyValueStore* stor wait(yield()); } + // TODO why is this seemingly random delay here? 
wait(delay(0.0001)); { @@ -4253,20 +4560,30 @@ ACTOR Future metricsCore(StorageServer* self, StorageServerInterface ssi) wait(self->byteSampleRecovery); - Tag tag = self->tag; self->actors.add(traceCounters("StorageMetrics", self->thisServerID, SERVER_KNOBS->STORAGE_LOGGING_DELAY, &self->counters.cc, self->thisServerID.toString() + "/StorageMetrics", - [tag, self=self](TraceEvent& te) { - te.detail("Tag", tag.toString()); - StorageBytes sb = self->storage.getStorageBytes(); + [self=self](TraceEvent& te) { + te.detail("Tag", self->tag.toString()); + StorageBytes sb = self->storage.getStorageBytes(); te.detail("KvstoreBytesUsed", sb.used); te.detail("KvstoreBytesFree", sb.free); te.detail("KvstoreBytesAvailable", sb.available); te.detail("KvstoreBytesTotal", sb.total); te.detail("KvstoreBytesTemp", sb.temp); + if (self->isTss()) { + te.detail("TSSPairID", self->tssPairID); + te.detail("TSSJointID", + UID(self->thisServerID.first() ^ self->tssPairID.get().first(), + self->thisServerID.second() ^ self->tssPairID.get().second())); + } else if (self->isSSWithTSSPair()) { + te.detail("SSPairID", self->ssPairID); + te.detail("TSSJointID", + UID(self->thisServerID.first() ^ self->ssPairID.get().first(), + self->thisServerID.second() ^ self->ssPairID.get().second())); + } })); loop { @@ -4370,6 +4687,20 @@ ACTOR Future serveGetValueRequests(StorageServer* self, FutureStream serveGetKeyValuesRequests(StorageServer* self, FutureStream getKeyValues) { loop { GetKeyValuesRequest req = waitNext(getKeyValues); + + if (req.begin.getKey().toString() == "m3fc7" && req.end.getKey().toString() == "s" && + req.version == 133421369) { + printf("%sSS %s got range read [%s - %s) @ %lld\n", + self->isTss() ? 
"T" : "", + self->thisServerID.toString().c_str(), + req.begin.getKey().printable().c_str(), + req.end.getKey().printable().c_str(), + req.version); + } + + // A TSS should never be the source for fetch keys + ASSERT(!self->tssPairID.present() || !req.isFetchKeys); + // Warning: This code is executed at extremely high priority (TaskPriority::LoadBalancedEndpoint), so downgrade // before doing real work self->actors.add(self->readGuard(req, getKeyValuesQ)); @@ -4601,6 +4932,28 @@ ACTOR Future storageServerCore(StorageServer* self, StorageServerInterface } } } + // SS monitors tss mapping here to see if it has a tss pair. + // This information is only used for ss/tss pair metrics reporting so it's ok to be eventually + // consistent. + if (!self->isTss()) { + ClientDBInfo clientInfo = self->db->get().client; + Optional myTssPair = clientInfo.getTssPair(self->thisServerID); + if (myTssPair.present()) { + // TODO REMOVE print, just for debugging + if (!self->ssPairID.present()) { + printf("SS %s found tss pair %s\n", + self->thisServerID.toString().c_str(), + myTssPair.get().id().toString().c_str()); + } + self->setSSWithTssPair(myTssPair.get().id()); + } else { + // TODO REMOVE print, just for debugging + if (self->ssPairID.present()) { + printf("SS %s lost tss pair\n", self->thisServerID.toString().c_str()); + } + self->clearSSWithTssPair(); + } + } } when(GetShardStateRequest req = waitNext(ssi.getShardState.getFuture())) { if (req.mode == GetShardStateRequest::NO_WAIT) { @@ -4667,18 +5020,19 @@ ACTOR Future memoryStoreRecover(IKeyValueStore* store, Reference tr = makeReference(cx); state int noCanRemoveCount = 0; loop { try { - tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); + tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - state bool canRemove = wait(canRemoveStorageServer(&tr, id)); + state bool canRemove = wait(canRemoveStorageServer(tr, id)); if (!canRemove) { 
TEST(true); // it's possible that the caller had a transaction in flight that assigned keys to the // server. Wait for it to reverse its mistake. wait(delayJittered(SERVER_KNOBS->REMOVE_RETRY_DELAY, TaskPriority::UpdateStorage)); - tr.reset(); + tr->reset(); TraceEvent("RemoveStorageServerRetrying") .detail("Count", noCanRemoveCount++) .detail("ServerID", id) @@ -4688,21 +5042,34 @@ ACTOR Future memoryStoreRecover(IKeyValueStore* store, ReferenceonError(e)); TraceEvent("RemoveStorageServerRetrying").error(err); } } } +// for creating a new storage server ACTOR Future storageServer(IKeyValueStore* persistentData, StorageServerInterface ssi, Tag seedTag, + Version tssSeedVersion, ReplyPromise recruitReply, Reference> db, std::string folder) { state StorageServer self(persistentData, db, ssi); + if (ssi.isTss) { + self.setTssPair(ssi.tssPairID); + ASSERT(self.isTss()); + } - self.sk = serverKeysPrefixFor(self.thisServerID).withPrefix(systemKeys.begin); // FFFF/serverKeys/[this server]/ + // TODO REMOVE + printf("initializing %sstorage %s with tag %s and tss pair=%s\n", + ssi.isTss ? "testing " : "", + ssi.id().toString().c_str(), + seedTag.toString().c_str(), + self.tssPairID.present() ? self.tssPairID.get().toString().c_str() : ""); + self.sk = serverKeysPrefixFor(self.tssPairID.present() ? self.tssPairID.get() : self.thisServerID) + .withPrefix(systemKeys.begin); // FFFF/serverKeys/[this server]/ self.folder = folder; try { @@ -4713,7 +5080,16 @@ ACTOR Future storageServer(IKeyValueStore* persistentData, std::pair verAndTag = wait(addStorageServer( self.cx, ssi)); // Might throw recruitment_failed in case of simultaneous master failure self.tag = verAndTag.second; - self.setInitialVersion(verAndTag.first - 1); + // self.setInitialVersion(ssi.isTss ? 
0 : verAndTag.first - 1); + if (ssi.isTss) { + printf("TSS %s overriding initial version from %lld to %lld\n", + ssi.id().toString().c_str(), + verAndTag.first - 1, + tssSeedVersion); + self.setInitialVersion(tssSeedVersion); + } else { + self.setInitialVersion(verAndTag.first - 1); + } } else { self.tag = seedTag; } @@ -4723,7 +5099,8 @@ ACTOR Future storageServer(IKeyValueStore* persistentData, TraceEvent("StorageServerInit", ssi.id()) .detail("Version", self.version.get()) - .detail("SeedTag", seedTag.toString()); + .detail("SeedTag", seedTag.toString()) + .detail("TssPair", ssi.isTss ? ssi.tssPairID.toString() : ""); InitializeStorageReply rep; rep.interf = ssi; rep.addedVersion = self.version.get(); @@ -4744,6 +5121,10 @@ ACTOR Future storageServer(IKeyValueStore* persistentData, } ACTOR Future replaceInterface(StorageServer* self, StorageServerInterface ssi) { + printf("SS %s replacing interface\ngetValue=%s\n", + ssi.id().toString().c_str(), + ssi.getValue.getEndpoint().token.toString().c_str()); + ASSERT(!ssi.isTss); state Transaction tr(self->cx); loop { @@ -4758,8 +5139,17 @@ ACTOR Future replaceInterface(StorageServer* self, StorageServerInterface GetStorageServerRejoinInfoRequest(ssi.id(), ssi.locality.dcId())) : Never())) { state GetStorageServerRejoinInfoReply rep = _rep; + + printf("SS %s got rejoin reply:\nversion: %" PRIu64 "\ntag: %s\nnewTag: %s\nnewLocality: %s\n", + ssi.id().toString().c_str(), + rep.version, + rep.tag.toString().c_str(), + rep.newTag.present() ? rep.newTag.get().toString().c_str() : "", + rep.newLocality ? "true" : "false"); + try { tr.reset(); + // TODO why doesn't this need ACCESS_SYSTEM_KEYS? 
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); tr.setVersion(rep.version); @@ -4776,6 +5166,7 @@ ACTOR Future replaceInterface(StorageServer* self, StorageServerInterface tagLocalityListValue(rep.newTag.get().locality)); } + // this only should happen if SS moved datacenters if (rep.newTag.present()) { KeyRange conflictRange = singleKeyRange(serverTagConflictKeyFor(rep.newTag.get())); tr.addReadConflictRange(conflictRange); @@ -4793,6 +5184,7 @@ ACTOR Future replaceInterface(StorageServer* self, StorageServerInterface choose { when(wait(tr.commit())) { + printf("SS committed rejoin txn\n"); self->history = rep.history; if (rep.newTag.present()) { @@ -4821,6 +5213,7 @@ ACTOR Future replaceInterface(StorageServer* self, StorageServerInterface when(wait(infoChanged)) {} } } catch (Error& e) { + printf("rejoin txn got error: %d!!\n", e.code()); wait(tr.onError(e)); } } @@ -4831,6 +5224,64 @@ ACTOR Future replaceInterface(StorageServer* self, StorageServerInterface return Void(); } +ACTOR Future replaceTSSInterface(StorageServer* self, StorageServerInterface ssi) { + // RYW for KeyBackedMap + state Reference tr = makeReference(self->cx); + state KeyBackedMap tssMapDB = KeyBackedMap(tssMappingKeys.begin); + + ASSERT(ssi.isTss); + + printf("TSS %s replacing interface:\ngetValue=%s\n", + ssi.id().toString().c_str(), + ssi.getValue.getEndpoint().token.toString().c_str()); + + // TODO should this loop until successful? it should never have conflicts, in theory + + loop { + try { + state Tag myTag; + + tr->reset(); + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); // TODO is this needed? 
+ tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); + + Optional pairTagValue = wait(tr->get(serverTagKeyFor(self->tssPairID.get()))); + + if (!pairTagValue.present()) { + TEST(true); // Race where tss was down, pair was removed, tss starts back up + throw worker_removed(); + } + + myTag = decodeServerTagValue(pairTagValue.get()); + + tr->addReadConflictRange(singleKeyRange(serverListKeyFor(ssi.id()))); + tr->set(serverListKeyFor(ssi.id()), serverListValue(ssi)); + + // add itself back to tss mapping + // tr->set(tssMappingKeyFor(self->tssPairID.get()), tssMappingValueFor(ssi.id())); + tssMapDB.set(tr, self->tssPairID.get(), ssi.id()); + tr->set(tssMappingChangeKey, deterministicRandom()->randomUniqueID().toString()); + + wait(tr->commit()); + + // TODO trace event instead + printf("tss %s added itself back, got tag %s for partner %s\n", + self->thisServerID.toString().c_str(), + self->tag.toString().c_str(), + self->tssPairID.get().toString().c_str()); + self->tag = myTag; + + break; + } catch (Error& e) { + printf("tss replace interface got error %d!!\n", e.code()); + wait(tr->onError(e)); + } + } + + return Void(); +} + +// for recovering an existing storage server ACTOR Future storageServer(IKeyValueStore* persistentData, StorageServerInterface ssi, Reference> db, @@ -4839,7 +5290,7 @@ ACTOR Future storageServer(IKeyValueStore* persistentData, Reference connFile) { state StorageServer self(persistentData, db, ssi); self.folder = folder; - self.sk = serverKeysPrefixFor(self.thisServerID).withPrefix(systemKeys.begin); // FFFF/serverKeys/[this server]/ + try { state double start = now(); TraceEvent("StorageServerRebootStart", self.thisServerID); @@ -4864,13 +5315,30 @@ ACTOR Future storageServer(IKeyValueStore* persistentData, } TraceEvent("SSTimeRestoreDurableState", self.thisServerID).detail("TimeTaken", now() - start); + // if this is a tss storage file, use that as source of truth for this server being a tss instead of the + // presence of the 
tss pair key in the storage engine + if (ssi.isTss) { + ASSERT(self.isTss()); + ssi.tssPairID = self.tssPairID.get(); + } else { + ASSERT(!self.isTss()); + } + ASSERT(self.thisServerID == ssi.id()); + + self.sk = serverKeysPrefixFor(self.tssPairID.present() ? self.tssPairID.get() : self.thisServerID) + .withPrefix(systemKeys.begin); // FFFF/serverKeys/[this server]/ + TraceEvent("StorageServerReboot", self.thisServerID).detail("Version", self.version.get()); if (recovered.canBeSet()) recovered.send(Void()); - wait(replaceInterface(&self, ssi)); + if (self.isTss()) { + wait(replaceTSSInterface(&self, ssi)); + } else { + wait(replaceInterface(&self, ssi)); + } TraceEvent("StorageServerStartingCore", self.thisServerID).detail("TimeTaken", now() - start); diff --git a/fdbserver/tester.actor.cpp b/fdbserver/tester.actor.cpp index 4b98b38486..1b23040d0d 100644 --- a/fdbserver/tester.actor.cpp +++ b/fdbserver/tester.actor.cpp @@ -869,6 +869,7 @@ ACTOR Future checkConsistency(Database cx, std::vector testers, bool doQuiescentCheck, bool doCacheCheck, + bool doTSSCheck, double quiescentWaitTimeout, double softTimeLimit, double databasePingDelay, @@ -885,12 +886,16 @@ ACTOR Future checkConsistency(Database cx, Standalone> options; StringRef performQuiescent = LiteralStringRef("false"); StringRef performCacheCheck = LiteralStringRef("false"); + StringRef performTSSCheck = LiteralStringRef("false"); if (doQuiescentCheck) { performQuiescent = LiteralStringRef("true"); } if (doCacheCheck) { performCacheCheck = LiteralStringRef("true"); } + if (doTSSCheck) { + performTSSCheck = LiteralStringRef("true"); + } spec.title = LiteralStringRef("ConsistencyCheck"); spec.databasePingDelay = databasePingDelay; spec.timeout = 32000; @@ -898,6 +903,7 @@ ACTOR Future checkConsistency(Database cx, KeyValueRef(LiteralStringRef("testName"), LiteralStringRef("ConsistencyCheck"))); options.push_back_deep(options.arena(), KeyValueRef(LiteralStringRef("performQuiescentChecks"), performQuiescent)); 
options.push_back_deep(options.arena(), KeyValueRef(LiteralStringRef("performCacheCheck"), performCacheCheck)); + options.push_back_deep(options.arena(), KeyValueRef(LiteralStringRef("performTSSCheck"), performTSSCheck)); options.push_back_deep(options.arena(), KeyValueRef(LiteralStringRef("quiescentWaitTimeout"), ValueRef(options.arena(), format("%f", quiescentWaitTimeout)))); @@ -973,6 +979,8 @@ ACTOR Future runTest(Database cx, testers, quiescent, spec.runConsistencyCheckOnCache, + // spec.runConsistencyCheckOnTSS, // TODO override with true to test + true, 10000.0, 18000, spec.databasePingDelay, @@ -1108,6 +1116,11 @@ std::maprunConsistencyCheckOnCache = (value == "true"); TraceEvent("TestParserTest").detail("ParsedRunConsistencyCheckOnCache", spec->runConsistencyCheckOnCache); } }, + { "runConsistencyCheckOnTSS", + [](const std::string& value, TestSpec* spec) { + spec->runConsistencyCheckOnTSS = (value == "true"); + TraceEvent("TestParserTest").detail("ParsedRunConsistencyCheckOnTSS", spec->runConsistencyCheckOnTSS); + } }, { "waitForQuiescence", [](const std::string& value, TestSpec* spec) { bool toWait = value == "true"; @@ -1416,14 +1429,19 @@ ACTOR Future runTests(ReferenceisSimulated() && enableDD) { + printf("waiting for DD\n"); wait(success(setDDMode(cx, 1))); + printf("done waiting for DD\n"); } } catch (Error& e) { TraceEvent(SevError, "TestFailure").error(e).detail("Reason", "Unable to set starting configuration"); } } + printf("starting configuration set, moving on\n"); + if (useDB && waitForQuiescenceBegin) { TraceEvent("TesterStartingPreTestChecks") .detail("DatabasePingDelay", databasePingDelay) @@ -1439,6 +1457,8 @@ ACTOR Future runTests(Reference loadedPonger(FutureStream pings) { } StringRef fileStoragePrefix = LiteralStringRef("storage-"); +StringRef testingStoragePrefix = LiteralStringRef("testingstorage-"); StringRef fileLogDataPrefix = LiteralStringRef("log-"); StringRef fileVersionedLogDataPrefix = LiteralStringRef("log2-"); StringRef 
fileLogQueuePrefix = LiteralStringRef("logqueue-"); @@ -315,6 +316,7 @@ std::string filenameFromSample(KeyValueStoreType storeType, std::string folder, } std::string filenameFromId(KeyValueStoreType storeType, std::string folder, std::string prefix, UID id) { + if (storeType == KeyValueStoreType::SSD_BTREE_V1) return joinPath(folder, prefix + id.toString() + ".fdb"); else if (storeType == KeyValueStoreType::SSD_BTREE_V2) @@ -326,6 +328,7 @@ std::string filenameFromId(KeyValueStoreType storeType, std::string folder, std: else if (storeType == KeyValueStoreType::SSD_ROCKSDB_V1) return joinPath(folder, prefix + id.toString() + ".rocksdb"); + printf("UNKNOWN storeType %s\n", storeType.toString().c_str()); UNREACHABLE(); } @@ -444,6 +447,9 @@ std::vector getDiskStores(std::string folder, if (filename.startsWith(fileStoragePrefix)) { store.storedComponent = DiskStore::Storage; prefix = fileStoragePrefix; + } else if (filename.startsWith(testingStoragePrefix)) { + store.storedComponent = DiskStore::Storage; + prefix = testingStoragePrefix; } else if (filename.startsWith(fileVersionedLogDataPrefix)) { store.storedComponent = DiskStore::TLogData; // Use the option string that's in the file rather than tLogOptions.toPrefix(), @@ -739,6 +745,7 @@ ACTOR Future storageServerRollbackRebooter(Future prevStorageServer, std::string filename, UID id, LocalityData locality, + bool isTss, Reference> db, std::string folder, ActorCollection* filesClosed, @@ -756,6 +763,7 @@ ACTOR Future storageServerRollbackRebooter(Future prevStorageServer, StorageServerInterface recruited; recruited.uniqueID = id; recruited.locality = locality; + recruited.isTss = isTss; recruited.initEndpoints(); DUMPTOKEN(recruited.getValue); @@ -1097,14 +1105,26 @@ ACTOR Future workerServer(Reference connFile, Future kvClosed = kv->onClosed(); filesClosed.add(kvClosed); + // std::string doesn't have startsWith + std::string tssPrefix = testingStoragePrefix.toString(); + // TODO might be more efficient to mark a 
boolean on DiskStore in getDiskStores, but that kind of breaks + // the abstraction since DiskStore also applies to storage cache + tlog + bool isTss = s.filename.find(tssPrefix) != std::string::npos; + // TODO REMOVE after test + printf("%s is%s tss filename\n", s.filename.c_str(), isTss ? "" : " not"); + Role ssRole = isTss ? Role::TESTING_STORAGE_SERVER : Role::STORAGE_SERVER; + StorageServerInterface recruited; recruited.uniqueID = s.storeID; recruited.locality = locality; + recruited.isTss = isTss; recruited.initEndpoints(); std::map details; details["StorageEngine"] = s.storeType.toString(); - startRole(Role::STORAGE_SERVER, recruited.id(), interf.id(), details, "Restored"); + details["IsTSS"] = isTss ? "Yes" : "No"; + + startRole(ssRole, recruited.id(), interf.id(), details, "Restored"); DUMPTOKEN(recruited.getValue); DUMPTOKEN(recruited.getKey); @@ -1129,12 +1149,13 @@ ACTOR Future workerServer(Reference connFile, s.filename, recruited.id(), recruited.locality, + isTss, dbInfo, folder, &filesClosed, memoryLimit, kv); - errorForwarders.add(forwardError(errors, Role::STORAGE_SERVER, recruited.id(), f)); + errorForwarders.add(forwardError(errors, ssRole, recruited.id(), f)); } else if (s.storedComponent == DiskStore::TLogData) { std::string logQueueBasename; const std::string filename = basename(s.filename); @@ -1487,13 +1508,29 @@ ACTOR Future workerServer(Reference connFile, } when(InitializeStorageRequest req = waitNext(interf.storage.getFuture())) { if (!storageCache.exists(req.reqId)) { + + printf("Got " + "InitializeStorageRequest:seedTag=%s\nreqId=%s\ninterfaceId=%s\nstoreType=%s\nisTss=%" + "s\ntssPairID=%s\ntssPairVersion=%lld\n\n", + req.seedTag.toString().c_str(), + req.reqId.toString().c_str(), + req.interfaceId.toString().c_str(), + req.storeType.toString().c_str(), + req.isTss ? "true" : "false", + req.isTss ? req.tssPairID.toString().c_str() : "", + req.isTss ? 
req.tssPairVersion : 0); + StorageServerInterface recruited(req.interfaceId); recruited.locality = locality; + recruited.isTss = req.isTss; + recruited.tssPairID = req.tssPairID; recruited.initEndpoints(); std::map details; details["StorageEngine"] = req.storeType.toString(); - startRole(Role::STORAGE_SERVER, recruited.id(), interf.id(), details); + details["IsTSS"] = std::to_string(recruited.isTss); + Role ssRole = recruited.isTss ? Role::TESTING_STORAGE_SERVER : Role::STORAGE_SERVER; + startRole(ssRole, recruited.id(), interf.id(), details); DUMPTOKEN(recruited.getValue); DUMPTOKEN(recruited.getKey); @@ -1508,16 +1545,21 @@ ACTOR Future workerServer(Reference connFile, DUMPTOKEN(recruited.getQueuingMetrics); DUMPTOKEN(recruited.getKeyValueStoreType); DUMPTOKEN(recruited.watchValue); - // printf("Recruited as storageServer\n"); + // TODO re-comment! + printf("Recruited as storageServer\n"); std::string filename = - filenameFromId(req.storeType, folder, fileStoragePrefix.toString(), recruited.id()); + filenameFromId(req.storeType, + folder, + recruited.isTss ? 
testingStoragePrefix.toString() : fileStoragePrefix.toString(), + recruited.id()); IKeyValueStore* data = openKVStore(req.storeType, filename, recruited.id(), memoryLimit); Future kvClosed = data->onClosed(); filesClosed.add(kvClosed); ReplyPromise storageReady = req.reply; storageCache.set(req.reqId, storageReady.getFuture()); - Future s = storageServer(data, recruited, req.seedTag, storageReady, dbInfo, folder); + Future s = + storageServer(data, recruited, req.seedTag, req.tssPairVersion, storageReady, dbInfo, folder); s = handleIOErrors(s, data, recruited.id(), kvClosed); s = storageCache.removeOnReady(req.reqId, s); s = storageServerRollbackRebooter(s, @@ -1525,12 +1567,13 @@ ACTOR Future workerServer(Reference connFile, filename, recruited.id(), recruited.locality, + req.isTss, dbInfo, folder, &filesClosed, memoryLimit, data); - errorForwarders.add(forwardError(errors, Role::STORAGE_SERVER, recruited.id(), s)); + errorForwarders.add(forwardError(errors, ssRole, recruited.id(), s)); } else forwardPromise(req.reply, storageCache.get(req.reqId)); } @@ -2111,6 +2154,7 @@ ACTOR Future fdbd(Reference connFile, const Role Role::WORKER("Worker", "WK", false); const Role Role::STORAGE_SERVER("StorageServer", "SS"); +const Role Role::TESTING_STORAGE_SERVER("TestingStorageServer", "TS"); const Role Role::TRANSACTION_LOG("TLog", "TL"); const Role Role::SHARED_TRANSACTION_LOG("SharedTLog", "SL", false); const Role Role::COMMIT_PROXY("CommitProxyServer", "CP"); @@ -2118,7 +2162,7 @@ const Role Role::GRV_PROXY("GrvProxyServer", "GP"); const Role Role::MASTER("MasterServer", "MS"); const Role Role::RESOLVER("Resolver", "RV"); const Role Role::CLUSTER_CONTROLLER("ClusterController", "CC"); -const Role Role::TESTER("Tester", "TS"); +const Role Role::TESTER("TestClient", "TC"); const Role Role::LOG_ROUTER("LogRouter", "LR"); const Role Role::DATA_DISTRIBUTOR("DataDistributor", "DD"); const Role Role::RATEKEEPER("Ratekeeper", "RK"); diff --git 
a/fdbserver/workloads/ConsistencyCheck.actor.cpp b/fdbserver/workloads/ConsistencyCheck.actor.cpp index 722e1f5e6e..0aae7ca9d4 100644 --- a/fdbserver/workloads/ConsistencyCheck.actor.cpp +++ b/fdbserver/workloads/ConsistencyCheck.actor.cpp @@ -48,6 +48,9 @@ struct ConsistencyCheckWorkload : TestWorkload { // Whether or not perform consistency check between storage cache servers and storage servers bool performCacheCheck; + // Whether or not to perform consistency check between storage servers and pair TSS + bool performTSSCheck; + // How long to wait for the database to go quiet before failing (if doing quiescent checks) double quiescentWaitTimeout; @@ -94,6 +97,7 @@ struct ConsistencyCheckWorkload : TestWorkload { ConsistencyCheckWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) { performQuiescentChecks = getOption(options, LiteralStringRef("performQuiescentChecks"), false); performCacheCheck = getOption(options, LiteralStringRef("performCacheCheck"), false); + performTSSCheck = getOption(options, LiteralStringRef("performTSSCheck"), false); quiescentWaitTimeout = getOption(options, LiteralStringRef("quiescentWaitTimeout"), 600.0); distributed = getOption(options, LiteralStringRef("distributed"), true); shardSampleFactor = std::max(getOption(options, LiteralStringRef("shardSampleFactor"), 1), 1); @@ -1057,7 +1061,9 @@ struct ConsistencyCheckWorkload : TestWorkload { TraceEvent("ConsistencyCheck_FailedToFetchMetrics") .detail("Begin", printable(shard.begin)) .detail("End", printable(shard.end)) - .detail("StorageServer", storageServers[i].id()); + .detail("StorageServer", storageServers[i].id()) + .detail("IsTSS", storageServers[i].isTss ? 
"True" : "False") + .error(reply.getError()); estimatedBytes.push_back(-1); } @@ -1074,7 +1080,10 @@ struct ConsistencyCheckWorkload : TestWorkload { .detail("Begin", printable(shard.begin)) .detail("End", printable(shard.end)) .detail("StorageServer1", storageServers[firstValidStorageServer].id()) - .detail("StorageServer2", storageServers[i].id()); + .detail("StorageServer2", storageServers[i].id()) + .detail("IsTSS", + storageServers[i].isTss || storageServers[firstValidStorageServer].isTss ? "True" + : "False"); } } } @@ -1236,6 +1245,28 @@ struct ConsistencyCheckWorkload : TestWorkload { } } + // add TSS to end of list, if configured and if not relocating + if (!isRelocating && self->performTSSCheck) { + printf("CCheck: Checking for tss to add: isRelocating: %s, performTSSCheck: %s\n", + isRelocating ? "T" : "F", + self->performTSSCheck ? "T" : "F"); + int initialSize = storageServers.size(); + for (int i = 0; i < initialSize; i++) { + Optional tssPair = cx->clientInfo->get().getTssPair(storageServers[i]); + if (tssPair.present()) { + printf("CCheck: Adding TSS %s to consistency check!\n", tssPair.get().id().toString().c_str()); + storageServers.push_back(tssPair.get().id()); + storageServerInterfaces.push_back(tssPair.get()); + } else { + printf("CCheck: SS %s doesn't have tss pair\n", storageServers[i].toString().c_str()); + } + } + } else { + printf("CCheck: Not checking for tss to add: isRelocating: %s, performTSSCheck: %s\n", + isRelocating ? "T" : "F", + self->performTSSCheck ? "T" : "F"); + } + state vector estimatedBytes = wait(self->getStorageSizeEstimate(storageServerInterfaces, range)); // Gets permitted size range of shard @@ -1323,7 +1354,8 @@ struct ConsistencyCheckWorkload : TestWorkload { // Be especially verbose if in simulation if (g_network->isSimulated()) { int invalidIndex = -1; - printf("\nSERVER %d (%s); shard = %s - %s:\n", + printf("\n%sSERVER %d (%s); shard = %s - %s:\n", + storageServerInterfaces[j].isTss ? 
"TSS " : "", j, storageServerInterfaces[j].address().toString().c_str(), printable(req.begin.getKey()).c_str(), @@ -1341,7 +1373,8 @@ struct ConsistencyCheckWorkload : TestWorkload { } printf( - "\nSERVER %d (%s); shard = %s - %s:\n", + "\n%sSERVER %d (%s); shard = %s - %s:\n", + storageServerInterfaces[firstValidServer].isTss ? "TSS " : "", firstValidServer, storageServerInterfaces[firstValidServer].address().toString().c_str(), printable(req.begin.getKey()).c_str(), @@ -1430,16 +1463,31 @@ struct ConsistencyCheckWorkload : TestWorkload { printable(referenceUniqueKey)) .detail("ValueMismatches", valueMismatches) .detail("ValueMismatchKey", printable(valueMismatchKey)) - .detail("MatchingKVPairs", matchingKVPairs); + .detail("MatchingKVPairs", matchingKVPairs) + .detail("IsTSS", + storageServerInterfaces[j].isTss || + storageServerInterfaces[firstValidServer].isTss + ? "True" + : "False"); - self->testFailure("Data inconsistent", true); - return false; + // TODO should the test still fail if TSS is wrong? Or is just logging the trace + // logs ok + if ((g_network->isSimulated() && + g_simulator.tssMode != ISimulator::TSSMode::EnabledDropMutations) || + (!storageServerInterfaces[j].isTss && + !storageServerInterfaces[firstValidServer].isTss)) { + self->testFailure("Data inconsistent", true); + return false; + } } } } // If the data is not available and we aren't relocating this shard else if (!isRelocating) { + Error e = + rangeResult.isError() ? 
rangeResult.getError() : rangeResult.get().error.get(); + TraceEvent("ConsistencyCheck_StorageServerUnavailable") .suppressFor(1.0) .detail("StorageServer", storageServers[j]) @@ -1448,10 +1496,20 @@ struct ConsistencyCheckWorkload : TestWorkload { .detail("Address", storageServerInterfaces[j].address()) .detail("UID", storageServerInterfaces[j].id()) .detail("GetKeyValuesToken", - storageServerInterfaces[j].getKeyValues.getEndpoint().token); + storageServerInterfaces[j].getKeyValues.getEndpoint().token) + .detail("IsTSS", storageServerInterfaces[j].isTss ? "True" : "False") + .error(e); + + printf("CC %sSS %s failed with error % d\n", + storageServerInterfaces[j].isTss ? "T" : "", + storageServers[j].toString().c_str(), + e.code()); // All shards should be available in quiscence - if (self->performQuiescentChecks) { + // TODO should the test still fail if TSS is unavailable? Or is just logging the trace + // logs ok + if (self->performQuiescentChecks && + (g_network->isSimulated() || !storageServerInterfaces[j].isTss)) { self->testFailure("Storage server unavailable"); return false; } @@ -1546,19 +1604,25 @@ struct ConsistencyCheckWorkload : TestWorkload { bool hasValidEstimate = estimatedBytes.size() > 0; // If the storage servers' sampled estimate of shard size is different from ours + // TODO should the test still fail if TSS has wrong estimate? Or is just logging the trace logs ok if (self->performQuiescentChecks) { for (int j = 0; j < estimatedBytes.size(); j++) { if (estimatedBytes[j] >= 0 && estimatedBytes[j] != sampledBytes) { TraceEvent("ConsistencyCheck_IncorrectEstimate") .detail("EstimatedBytes", estimatedBytes[j]) .detail("CorrectSampledBytes", sampledBytes) - .detail("StorageServer", storageServers[j]); - self->testFailure("Storage servers had incorrect sampled estimate"); + .detail("StorageServer", storageServers[j]) + .detail("IsTSS", storageServerInterfaces[j].isTss ? 
"True" : "False"); + + if (!storageServerInterfaces[j].isTss) { + self->testFailure("Storage servers had incorrect sampled estimate"); + } hasValidEstimate = false; break; - } else if (estimatedBytes[j] < 0) { + } else if (estimatedBytes[j] < 0 && + (g_network->isSimulated() || !storageServerInterfaces[j].isTss)) { self->testFailure("Could not get storage metrics from server"); hasValidEstimate = false; break; @@ -1670,7 +1734,9 @@ struct ConsistencyCheckWorkload : TestWorkload { if (!keyValueStoreType.present()) { TraceEvent("ConsistencyCheck_ServerUnavailable").detail("ServerID", storageServers[i].id()); self->testFailure("Storage server unavailable"); - } else if (keyValueStoreType.get() != configuration.storageServerStoreType) { + } else if ((!storageServers[i].isTss && keyValueStoreType.get() != configuration.storageServerStoreType) || + (storageServers[i].isTss && + keyValueStoreType.get() != configuration.testingStorageServerStoreType)) { TraceEvent("ConsistencyCheck_WrongKeyValueStoreType") .detail("ServerID", storageServers[i].id()) .detail("StoreType", keyValueStoreType.get().toString()) @@ -1681,6 +1747,10 @@ struct ConsistencyCheckWorkload : TestWorkload { // Check each pair of storage servers for an address match for (j = i + 1; j < storageServers.size(); j++) { + // TODO change this hack back once i fix recruitment + /*if (storageServers[i].isTss || storageServers[j].isTss) { + continue; + }*/ if (storageServers[i].address() == storageServers[j].address()) { TraceEvent("ConsistencyCheck_UndesirableServer") .detail("StorageServer1", storageServers[i].id()) @@ -1701,8 +1771,18 @@ struct ConsistencyCheckWorkload : TestWorkload { ConsistencyCheckWorkload* self) { state vector workers = wait(getWorkers(self->dbInfo)); state vector storageServers = wait(getStorageServers(cx)); - std::set> missingStorage; + std::vector> missingStorage; // vector instead of a set to get the count + printf("CC starting check for storage: %d workers, %d SS\n", workers.size(), 
storageServers.size()); + printf("CC checking %d regions: ", configuration.regions.size()); + if (configuration.regions.size() == 1) { + printf("%s", configuration.regions[0].dcId.toString().c_str()); + } else if (configuration.regions.size() == 2) { + printf("%s %s", + configuration.regions[0].dcId.toString().c_str(), + configuration.regions[1].dcId.toString().c_str()); + } + printf("\n"); for (int i = 0; i < workers.size(); i++) { NetworkAddress addr = workers[i].interf.stableAddress(); if (!configuration.isExcludedServer(workers[i].interf.addresses()) && @@ -1712,29 +1792,83 @@ struct ConsistencyCheckWorkload : TestWorkload { for (int j = 0; j < storageServers.size(); j++) { if (storageServers[j].stableAddress() == addr) { found = true; + printf("CC found SS %s on %s in dc %s\n", + storageServers[j].id().toString().c_str(), + addr.toString().c_str(), + workers[i].interf.locality.dcId().present() + ? workers[i].interf.locality.dcId().get().toString().c_str() + : ""); break; } } if (!found) { + if (configuration.regions.size() == 0 || + (configuration.regions.size() == 1 && + workers[i].interf.locality.dcId() == configuration.regions[0].dcId) || + (configuration.regions.size() == 2 && + (workers[i].interf.locality.dcId() == configuration.regions[0].dcId || + workers[i].interf.locality.dcId() == configuration.regions[1].dcId))) { + printf("CC found no SS on %s in dc %s\n", + addr.toString().c_str(), + workers[i].interf.locality.dcId().present() + ? workers[i].interf.locality.dcId().get().toString().c_str() + : ""); + } + TraceEvent("ConsistencyCheck_NoStorage") .detail("Address", addr) .detail("ProcessClassEqualToStorageClass", (int)(workers[i].processClass == ProcessClass::StorageClass)); - missingStorage.insert(workers[i].interf.locality.dcId()); + missingStorage.push_back(workers[i].interf.locality.dcId()); } } } + int missingDc0 = configuration.regions.size() == 0 + ? 
0 + : std::count(missingStorage.begin(), missingStorage.end(), configuration.regions[0].dcId); + int missingDc1 = configuration.regions.size() < 2 + ? 0 + : std::count(missingStorage.begin(), missingStorage.end(), configuration.regions[1].dcId); + if ((configuration.regions.size() == 0 && missingStorage.size()) || - (configuration.regions.size() == 1 && missingStorage.count(configuration.regions[0].dcId)) || - (configuration.regions.size() == 2 && configuration.usableRegions == 1 && - missingStorage.count(configuration.regions[0].dcId) && - missingStorage.count(configuration.regions[1].dcId)) || - (configuration.regions.size() == 2 && configuration.usableRegions > 1 && - (missingStorage.count(configuration.regions[0].dcId) || - missingStorage.count(configuration.regions[1].dcId)))) { - self->testFailure("No storage server on worker"); - return false; + (configuration.regions.size() == 1 && missingDc0) || + (configuration.regions.size() == 2 && configuration.usableRegions == 1 && missingDc0 && missingDc1) || + (configuration.regions.size() == 2 && configuration.usableRegions > 1 && (missingDc0 || missingDc1))) { + + // TODO could improve this check by also ensuring DD is currently recruiting a TSS by using quietdb? + bool couldExpectMissingTss = + (configuration.desiredTSSCount - self->dbInfo->get().client.tssMapping.size()) > 0; + printf("CC couldExpectMissingTss = %s\n", couldExpectMissingTss ? 
"True" : "False"); + + int countMissing = missingStorage.size(); + int acceptableTssMissing = 1; + if (configuration.regions.size() == 1) { + countMissing = missingDc0; + } else if (configuration.regions.size() == 2) { + if (configuration.usableRegions == 1) { + // all processes should be missing from 1, so take the number missing from the other + countMissing = std::min(missingDc0, missingDc1); + } else if (configuration.usableRegions == 2) { + countMissing = missingDc0 + missingDc1; + acceptableTssMissing = 2; + } else { + ASSERT(false); // in case fdb ever adds 3+ region support? + } + } + + if (!couldExpectMissingTss || countMissing > acceptableTssMissing) { + printf("No storage server on %d workers. CouldBeTSS=%s, acceptableTssMissing=%d\n", + countMissing, + couldExpectMissingTss ? "T" : "F", + acceptableTssMissing); + self->testFailure("No storage server on worker"); + return false; + } else { + // TODO sev=30 warn instead of print + printf("CC found %d missing storage server on worker, but it is likely a tss(es) waiting for a pair\n", + configuration.usableRegions); + } } return true; @@ -1751,8 +1885,10 @@ struct ConsistencyCheckWorkload : TestWorkload { state bool foundExtraDataStore = false; state std::vector protectedProcessesToKill; + printf("CC checking for extra data stores\n"); state std::map> statefulProcesses; for (const auto& ss : storageServers) { + printf("CC Marking %ss as ok\n", ss.id().toString().c_str()); statefulProcesses[ss.address()].insert(ss.id()); // A process may have two addresses (same ip, different ports) if (ss.secondaryAddress().present()) { @@ -1809,6 +1945,9 @@ struct ConsistencyCheckWorkload : TestWorkload { if (statefulProcesses[itr->interf.address()].count(id)) { continue; } + printf("CC found extra data store %s on %s\n", + id.toString().c_str(), + itr->interf.address().toString().c_str()); // For extra data store TraceEvent("ConsistencyCheck_ExtraDataStore") .detail("Address", itr->interf.address()) @@ -1841,7 +1980,10 @@ 
struct ConsistencyCheckWorkload : TestWorkload { } } + printf("CC check for extra data stores complete\n"); + if (foundExtraDataStore) { + printf("CC Extra Data Stores\n"); self->testFailure("Extra data stores present on workers"); return false; } diff --git a/fdbserver/workloads/RandomMoveKeys.actor.cpp b/fdbserver/workloads/RandomMoveKeys.actor.cpp index 8bdafb35e0..4fe5654a18 100644 --- a/fdbserver/workloads/RandomMoveKeys.actor.cpp +++ b/fdbserver/workloads/RandomMoveKeys.actor.cpp @@ -162,12 +162,13 @@ struct MoveKeysWorkload : TestWorkload { // The real data distribution algorithm doesn't want to deal with multiple servers // with the same address having keys. So if there are two servers with the same address, // don't use either one (so we don't have to find out which of them, if any, already has keys). + // Also get rid of tss since we don't want to move a shard to a tss. std::map count; for (int s = 0; s < servers.size(); s++) count[servers[s].address()]++; int o = 0; for (int s = 0; s < servers.size(); s++) - if (count[servers[s].address()] == 1) + if (count[servers[s].address()] == 1 && !servers[s].isTss) servers[o++] = servers[s]; servers.resize(o); } diff --git a/fdbserver/workloads/workloads.actor.h b/fdbserver/workloads/workloads.actor.h index 36fcf28312..ffd669e88b 100644 --- a/fdbserver/workloads/workloads.actor.h +++ b/fdbserver/workloads/workloads.actor.h @@ -152,6 +152,7 @@ public: databasePingDelay = g_network->isSimulated() ? 
0.0 : 15.0; runConsistencyCheck = g_network->isSimulated(); runConsistencyCheckOnCache = false; + runConsistencyCheckOnTSS = false; waitForQuiescenceBegin = true; waitForQuiescenceEnd = true; simCheckRelocationDuration = false; @@ -167,8 +168,8 @@ public: double databasePingDelay = -1.0) : title(title), dumpAfterTest(dump), clearAfterTest(clear), startDelay(startDelay), useDB(useDB), timeout(600), databasePingDelay(databasePingDelay), runConsistencyCheck(g_network->isSimulated()), - runConsistencyCheckOnCache(false), waitForQuiescenceBegin(true), waitForQuiescenceEnd(true), - simCheckRelocationDuration(false), simConnectionFailuresDisableDuration(0), + runConsistencyCheckOnCache(false), runConsistencyCheckOnTSS(false), waitForQuiescenceBegin(true), + waitForQuiescenceEnd(true), simCheckRelocationDuration(false), simConnectionFailuresDisableDuration(0), simBackupAgents(ISimulator::BackupAgentType::NoBackupAgents), simDrAgents(ISimulator::BackupAgentType::NoBackupAgents) { phases = TestWorkload::SETUP | TestWorkload::EXECUTION | TestWorkload::CHECK | TestWorkload::METRICS; @@ -187,6 +188,7 @@ public: double databasePingDelay; bool runConsistencyCheck; bool runConsistencyCheckOnCache; + bool runConsistencyCheckOnTSS; bool waitForQuiescenceBegin; bool waitForQuiescenceEnd; diff --git a/flow/ProtocolVersion.h b/flow/ProtocolVersion.h index d3c601a9b5..7feb6b3839 100644 --- a/flow/ProtocolVersion.h +++ b/flow/ProtocolVersion.h @@ -121,7 +121,7 @@ public: // introduced features PROTOCOL_VERSION_FEATURE(0x0FDB00B062010001LL, CloseUnusedConnection); PROTOCOL_VERSION_FEATURE(0x0FDB00B063010000LL, DBCoreState); PROTOCOL_VERSION_FEATURE(0x0FDB00B063010000LL, TagThrottleValue); - PROTOCOL_VERSION_FEATURE(0x0FDB00B063010000LL, ServerListValue); + PROTOCOL_VERSION_FEATURE(0x0FDB00B070010001LL, ServerListValue); PROTOCOL_VERSION_FEATURE(0x0FDB00B063010000LL, StorageCacheValue); PROTOCOL_VERSION_FEATURE(0x0FDB00B063010000LL, RestoreStatusValue); 
PROTOCOL_VERSION_FEATURE(0x0FDB00B063010000LL, RestoreRequestValue); @@ -138,6 +138,8 @@ public: // introduced features PROTOCOL_VERSION_FEATURE(0x0FDB00B070010000LL, StableInterfaces); PROTOCOL_VERSION_FEATURE(0x0FDB00B070010001LL, TagThrottleValueReason); PROTOCOL_VERSION_FEATURE(0x0FDB00B070010001LL, SpanContext); + // TODO is this right? + PROTOCOL_VERSION_FEATURE(0x0FDB00B070010001LL, TSS); }; template <> diff --git a/flow/genericactors.actor.h b/flow/genericactors.actor.h index 7bf2a05e63..e0e84c6e25 100644 --- a/flow/genericactors.actor.h +++ b/flow/genericactors.actor.h @@ -1230,6 +1230,8 @@ Future brokenPromiseToMaybeDelivered(Future in) { return t; } catch (Error& e) { if (e.code() == error_code_broken_promise) { + // TODO REMOVE! + printf("broken promise!!"); throw request_maybe_delivered(); } throw; diff --git a/flow/serialize.h b/flow/serialize.h index 81bb18ad4d..7653648a80 100644 --- a/flow/serialize.h +++ b/flow/serialize.h @@ -22,6 +22,9 @@ #define FLOW_SERIALIZE_H #pragma once +// TODO REMOVE +#include + #include #include #include @@ -109,6 +112,12 @@ class Serializer { public: static void serialize(Archive& ar, T& t) { t.serialize(ar); + // TODO REMOVE + if (!ar.protocolVersion().isValid()) { + printf("invalid protocol version %" PRIx64 " < %" PRIx64 "!!!\n", + ar.protocolVersion().version(), + ProtocolVersion::minValidProtocolVersion); + } ASSERT(ar.protocolVersion().isValid()); } }; diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index e12b1e3ce9..63f5c3bab8 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -87,7 +87,9 @@ if(WITH_PYTHON) add_fdb_test(TEST_FILES SlowTask.txt IGNORE) add_fdb_test(TEST_FILES SpecificUnitTest.txt IGNORE) add_fdb_test(TEST_FILES StorageMetricsSampleTests.txt IGNORE) + add_fdb_test(TEST_FILES StorageServerInterface.txt) add_fdb_test(TEST_FILES StreamingWrite.txt IGNORE) + add_fdb_test(TEST_FILES SystemData.txt) add_fdb_test(TEST_FILES ThreadSafety.txt IGNORE) add_fdb_test(TEST_FILES 
TraceEventMetrics.txt IGNORE) add_fdb_test(TEST_FILES PopulateTPCC.txt IGNORE) diff --git a/tests/StorageServerInterface.txt b/tests/StorageServerInterface.txt new file mode 100644 index 0000000000..b2bf01bb14 --- /dev/null +++ b/tests/StorageServerInterface.txt @@ -0,0 +1,7 @@ +testTitle=UnitTests +startDelay=0 +useDB=false + + testName=UnitTests + maxTestCases=0 + testsMatching=/StorageServerInterface/ \ No newline at end of file diff --git a/tests/SystemData.txt b/tests/SystemData.txt new file mode 100644 index 0000000000..e8bbc2c57d --- /dev/null +++ b/tests/SystemData.txt @@ -0,0 +1,7 @@ +testTitle=UnitTests +startDelay=0 +useDB=false + + testName=UnitTests + maxTestCases=0 + testsMatching=/SystemData/ \ No newline at end of file From 4257ac2b4dac2f41df581e8f7a3105a45c2a6354 Mon Sep 17 00:00:00 2001 From: Josh Slocum Date: Wed, 12 May 2021 18:53:20 +0000 Subject: [PATCH 436/461] More TSS Changes/Fixes --- fdbclient/BackupAgentBase.actor.cpp | 4 +- fdbclient/CommitProxyInterface.h | 1 - fdbclient/DatabaseContext.h | 1 - fdbclient/ManagementAPI.actor.cpp | 1 - fdbclient/NativeAPI.actor.cpp | 103 +++--- fdbclient/StorageServerInterface.cpp | 162 ++------ fdbclient/StorageServerInterface.h | 12 +- fdbclient/SystemData.cpp | 17 +- fdbrpc/LoadBalance.actor.h | 53 +-- fdbrpc/QueueModel.cpp | 15 +- fdbrpc/QueueModel.h | 12 +- fdbrpc/TSSComparison.h | 23 +- fdbrpc/fdbrpc.h | 2 - fdbserver/ApplyMetadataMutation.cpp | 72 +++- fdbserver/ClusterController.actor.cpp | 34 +- fdbserver/CommitProxyServer.actor.cpp | 3 - fdbserver/DataDistribution.actor.cpp | 265 ++++++------- fdbserver/DataDistributionTracker.actor.cpp | 14 +- fdbserver/Knobs.cpp | 4 +- fdbserver/Knobs.h | 2 + fdbserver/MoveKeys.actor.cpp | 171 +++------ fdbserver/MutationTracking.cpp | 3 - fdbserver/QuietDatabase.actor.cpp | 6 - fdbserver/Ratekeeper.actor.cpp | 2 +- fdbserver/SimulatedCluster.actor.cpp | 23 +- fdbserver/Status.actor.cpp | 12 +- fdbserver/TLogServer.actor.cpp | 5 - 
fdbserver/WorkerInterface.actor.h | 11 +- fdbserver/masterserver.actor.cpp | 6 - fdbserver/storageserver.actor.cpp | 349 ++---------------- fdbserver/tester.actor.cpp | 10 +- fdbserver/worker.actor.cpp | 47 ++- .../workloads/ConsistencyCheck.actor.cpp | 104 +----- fdbserver/workloads/RandomMoveKeys.actor.cpp | 2 +- fdbserver/workloads/workloads.actor.h | 2 +- flow/Knobs.cpp | 1 + flow/Knobs.h | 1 + flow/genericactors.actor.h | 2 - flow/serialize.h | 9 - 39 files changed, 482 insertions(+), 1084 deletions(-) diff --git a/fdbclient/BackupAgentBase.actor.cpp b/fdbclient/BackupAgentBase.actor.cpp index cc861f310a..4b00857503 100644 --- a/fdbclient/BackupAgentBase.actor.cpp +++ b/fdbclient/BackupAgentBase.actor.cpp @@ -406,7 +406,7 @@ ACTOR Future readCommitted(Database cx, // When this buggify line is enabled, if there are more than 1 result then use half of the results // Copy the data instead of messing with the results directly to avoid TSS issues. if (values.size() > 1 && BUGGIFY) { - Standalone copy; + RangeResult copy; // only copy first half of values into copy for (int i = 0; i < values.size() / 2; i++) { copy.push_back_deep(copy.arena(), values[i]); @@ -478,7 +478,7 @@ ACTOR Future readCommitted(Database cx, // When this buggify line is enabled, if there are more than 1 result then use half of the results. // Copy the data instead of messing with the results directly to avoid TSS issues. 
if (rangevalue.size() > 1 && BUGGIFY) { - Standalone copy; + RangeResult copy; // only copy first half of rangevalue into copy for (int i = 0; i < rangevalue.size() / 2; i++) { copy.push_back_deep(copy.arena(), rangevalue[i]); diff --git a/fdbclient/CommitProxyInterface.h b/fdbclient/CommitProxyInterface.h index 16f6695a03..2ac4481a15 100644 --- a/fdbclient/CommitProxyInterface.h +++ b/fdbclient/CommitProxyInterface.h @@ -125,7 +125,6 @@ struct ClientDBInfo { bool operator!=(ClientDBInfo const& r) const { return id != r.id; } // convenience method to treat tss mapping like a map - // TODO can serializer handle maps? could just change it Optional getTssPair(UID storageServerID) const { for (auto& it : tssMapping) { if (it.first == storageServerID) { diff --git a/fdbclient/DatabaseContext.h b/fdbclient/DatabaseContext.h index b1dee87f18..2a3d2ec35a 100644 --- a/fdbclient/DatabaseContext.h +++ b/fdbclient/DatabaseContext.h @@ -425,7 +425,6 @@ public: static const std::vector debugTransactionTagChoices; std::unordered_map> watchMap; - // TODO should this be private? void maybeAddTssMapping(StorageServerInterface const& ssi); void addTssMapping(StorageServerInterface const& ssi, StorageServerInterface const& tssi); }; diff --git a/fdbclient/ManagementAPI.actor.cpp b/fdbclient/ManagementAPI.actor.cpp index 8b4d03b4d8..490d7404d9 100644 --- a/fdbclient/ManagementAPI.actor.cpp +++ b/fdbclient/ManagementAPI.actor.cpp @@ -358,7 +358,6 @@ ConfigurationResult buildConfiguration(std::vector const& modeTokens, // A new tss setup must have count + storage engine. An adjustment must have at least one. if ((isNew && (!count.present() || !storageEngine.present())) || (!isNew && !count.present() && !storageEngine.present())) { - // TODO is this the right error type? And should we log something? 
return ConfigurationResult::INCOMPLETE_CONFIGURATION; } diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 9cd9a32d8c..b5ec6c179f 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -122,7 +122,6 @@ NetworkOptions::NetworkOptions() static const Key CLIENT_LATENCY_INFO_PREFIX = LiteralStringRef("client_latency/"); static const Key CLIENT_LATENCY_INFO_CTR_PREFIX = LiteralStringRef("client_latency_counter/"); -// TODO make tss function here void DatabaseContext::maybeAddTssMapping(StorageServerInterface const& ssi) { // add tss mapping if server is new @@ -135,21 +134,14 @@ void DatabaseContext::maybeAddTssMapping(StorageServerInterface const& ssi) { // calling getInterface potentially recursively is weird, but since this function is only called when an entry is // created/changed, the recursive call should never recurse itself. void DatabaseContext::addTssMapping(StorageServerInterface const& ssi, StorageServerInterface const& tssi) { - // TODO get both with a getInterface call which will create the tss endpoint and/or update both endpoints if there - // was a change in endpoint tokens - - // the order of these is important because it hits the "different token same locality" issue, so we always want to - // request the tss first so the ss request overrides it. - // TODO this shouldn't be necessary after i stop doing the same server hack Reference tssInfo = StorageServerInfo::getInterface(this, tssi, clientLocality); Reference ssInfo = StorageServerInfo::getInterface(this, ssi, clientLocality); - // add new tss metrics object to queue Reference metrics = makeReference(); tssMetrics[tssi.id()] = metrics; - // TODO any other requests it makes sense to duplicate? 
- // add each read data request interface to map (getValue, getKey, getKeyValues, watchValue) + // Add each read data request we want to duplicate to TSS to endpoint mapping (getValue, getKey, getKeyValues, + // watchValue) queueModel.updateTssEndpoint( ssInfo->interf.getValue.getEndpoint().token.first(), TSSEndpointData(tssi.id(), tssInfo->interf.getValue.getEndpoint(), metrics, clientInfo->get().id)); @@ -162,10 +154,6 @@ void DatabaseContext::addTssMapping(StorageServerInterface const& ssi, StorageSe queueModel.updateTssEndpoint( ssInfo->interf.watchValue.getEndpoint().token.first(), TSSEndpointData(tssi.id(), tssInfo->interf.watchValue.getEndpoint(), metrics, clientInfo->get().id)); - - // TODO REMOVE - printf( - "added tss endpoints to queue for mapping %s=%s\n", ssi.id().toString().c_str(), tssi.id().toString().c_str()); } Reference StorageServerInfo::getInterface(DatabaseContext* cx, @@ -182,16 +170,11 @@ Reference StorageServerInfo::getInterface(DatabaseContext* cx // changes. it->second->interf = ssi; - - // TODO remove print - printf("maybeAddTss same locality %s\n", ssi.id().toString().c_str()); cx->maybeAddTssMapping(ssi); } else { it->second->notifyContextDestroyed(); Reference loc(new StorageServerInfo(cx, ssi, locality)); cx->server_interf[ssi.id()] = loc.getPtr(); - // TODO REMOVE print - printf("maybeAddTss different locality %s\n", ssi.id().toString().c_str()); cx->maybeAddTssMapping(ssi); return loc; } @@ -202,8 +185,6 @@ Reference StorageServerInfo::getInterface(DatabaseContext* cx Reference loc(new StorageServerInfo(cx, ssi, locality)); cx->server_interf[ssi.id()] = loc.getPtr(); - // TODO REMOVE print - // printf("maybeAddTss new ssi %s\n", ssi.id().toString().c_str()); cx->maybeAddTssMapping(ssi); return loc; } @@ -343,6 +324,13 @@ void delref(DatabaseContext* ptr) { ptr->delref(); } +void traceTSSErrors(const char* name, UID tssId, const std::unordered_map& errorsByCode) { + TraceEvent ev(name, tssId); + for (auto& it : errorsByCode) { + 
ev.detail("E" + std::to_string(it.first), it.second); + } +} + ACTOR Future databaseLogger(DatabaseContext* cx) { state double lastLogged = 0; loop { @@ -389,11 +377,18 @@ ACTOR Future databaseLogger(DatabaseContext* cx) { // TODO could skip this tss if request counter is zero? would potentially complicate elapsed calculation // though if (it.second->mismatches.getIntervalDelta()) { - printf("Found tss %s with %d mismatches!!\n", - it.first.toString().c_str(), - it.second->mismatches.getIntervalDelta()); cx->tssMismatchStream.send(it.first); } + + // do error histograms as separate event + if (it.second->ssErrorsByCode.size()) { + traceTSSErrors("TSS_SSErrors", it.first, it.second->ssErrorsByCode); + } + + if (it.second->tssErrorsByCode.size()) { + traceTSSErrors("TSS_TSSErrors", it.first, it.second->tssErrorsByCode); + } + TraceEvent tssEv("TSSClientMetrics", cx->dbId); tssEv.detail("TSSID", it.first) .detail("Elapsed", (lastLogged == 0) ? 0 : now() - lastLogged) @@ -409,7 +404,7 @@ ACTOR Future databaseLogger(DatabaseContext* cx) { tssEv.detail("MeanTSSGetValueLatency", it.second->TSSgetValueLatency.mean()) .detail("MedianTSSGetValueLatency", it.second->TSSgetValueLatency.median()) .detail("TSSGetValueLatency90", it.second->TSSgetValueLatency.percentile(0.90)) - .detail("TSSGetValueLatencyDiff99", it.second->TSSgetValueLatency.percentile(0.99)); + .detail("TSSGetValueLatency99", it.second->TSSgetValueLatency.percentile(0.99)); tssEv.detail("MeanSSGetKeyLatency", it.second->SSgetKeyLatency.mean()) .detail("MedianSSGetKeyLatency", it.second->SSgetKeyLatency.median()) @@ -419,7 +414,7 @@ ACTOR Future databaseLogger(DatabaseContext* cx) { tssEv.detail("MeanTSSGetKeyLatency", it.second->TSSgetKeyLatency.mean()) .detail("MedianTSSGetKeyLatency", it.second->TSSgetKeyLatency.median()) .detail("TSSGetKeyLatency90", it.second->TSSgetKeyLatency.percentile(0.90)) - .detail("TSSGetKeyLatencyDiff99", it.second->TSSgetKeyLatency.percentile(0.99)); + 
.detail("TSSGetKeyLatency99", it.second->TSSgetKeyLatency.percentile(0.99)); tssEv.detail("MeanSSGetKeyValuesLatency", it.second->SSgetKeyLatency.mean()) .detail("MedianSSGetKeyValuesLatency", it.second->SSgetKeyLatency.median()) @@ -429,7 +424,7 @@ ACTOR Future databaseLogger(DatabaseContext* cx) { tssEv.detail("MeanTSSGetKeyValuesLatency", it.second->TSSgetKeyValuesLatency.mean()) .detail("MedianTSSGetKeyValuesLatency", it.second->TSSgetKeyValuesLatency.median()) .detail("TSSGetKeyValuesLatency90", it.second->TSSgetKeyValuesLatency.percentile(0.90)) - .detail("TSSGetKeyValuesLatencyDiff99", it.second->TSSgetKeyValuesLatency.percentile(0.99)); + .detail("TSSGetKeyValuesLatency99", it.second->TSSgetKeyValuesLatency.percentile(0.99)); it.second->clear(); } @@ -826,13 +821,12 @@ ACTOR static Future monitorTssChange(DatabaseContext* cx) { loop { wait(cx->clientInfo->onChange()); if (cx->clientInfo->get().tssMapping != curTssMapping) { - // TODO maybe re-read this from system keys instead if it changes + // To optimize size of the ClientDBInfo payload, we could eventually change CC to just send a tss change + // id/generation, and have client reread the mapping here if it changed. It's a very minor optimization + // though, and would cause extra read load. 
ClientDBInfo clientInfo = cx->clientInfo->get(); curTssMapping = clientInfo.tssMapping; - // TODO REMOVE print - // printf("gonna do tss stuff with %d tss's\n", curTssMapping.size()); - std::unordered_set seenTssIds; if (curTssMapping.size()) { @@ -840,15 +834,7 @@ ACTOR static Future monitorTssChange(DatabaseContext* cx) { seenTssIds.insert(it.second.id()); if (cx->server_interf.count(it.first)) { - // TODO REMOVE - printf("found new tss mapping %s -> %s\n", - it.first.toString().c_str(), - it.second.id().toString().c_str()); cx->addTssMapping(cx->server_interf[it.first]->interf, it.second); - } else { - // TODO REMOVE case and print - // printf("server %s with tss pair %s not in server_interf, skipping for now\n", - // it.first.toString().c_str(), it.second.id().toString().c_str()); } } } @@ -857,8 +843,6 @@ ACTOR static Future monitorTssChange(DatabaseContext* cx) { if (seenTssIds.count(it->first)) { it++; } else { - // TODO REMOVE - printf("Erasing tss %s from tss_metrics\n", it->first.toString().c_str()); it = cx->tssMetrics.erase(it); } } @@ -883,18 +867,14 @@ ACTOR static Future handleTssMismatches(DatabaseContext* cx) { break; } } - // TODO maybe instead of assert, do a trace event because it's possible that by the time we checked the mismatch - // the tss is gone? 
if (found) { - // TODO add trace event + TraceEvent(SevWarnAlways, "TSS_KillMismatch").detail("TSSID", tssID.toString()); TEST(true); // killing TSS because it got mismatch - printf("KILLING TSS %s (partner=%s) BECAUSE OF TSS MISMATCH\n", - tssID.toString().c_str(), - tssPairID.toString().c_str()); - + // TODO we could write something to the system keyspace and then have DD listen to that keyspace and then DD // do exactly this, so why not just cut out the middle man (or the middle system keys, as it were) tr = makeReference(Database(Reference::addRef(cx))); + state int tries = 0; loop { try { tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); @@ -908,16 +888,20 @@ ACTOR static Future handleTssMismatches(DatabaseContext* cx) { break; } catch (Error& e) { - printf("Kill Mismatch TSS Transaction got error %d\n", e.code()); wait(tr->onError(e)); } + tries++; + if (tries > 10) { + // Give up on trying to kill the tss, it'll get another mismatch or a human will investigate + // eventually + TraceEvent("TSS_KillMismatchGaveUp").detail("TSSID", tssID.toString()); + break; + } } - tr = makeReference(); // clear out txn so that the extra ref gets decref'd and we - // can free cx - + // clear out txn so that the extra DatabaseContext ref gets decref'd and we can free cx + tr = makeReference(); } else { TEST(true); // Not killing TSS with mismatch because it's already gone - printf("Not killing TSS %s because of tss mismatch, must be already removed\n", tssID.toString().c_str()); } } } @@ -1264,14 +1248,16 @@ DatabaseContext::DatabaseContext(Reference( KeyRangeRef(LiteralStringRef("profiling/"), LiteralStringRef("profiling0")) - .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin))); + .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin))); registerSpecialKeySpaceModule( - SpecialKeySpace::MODULE::MANAGEMENT, SpecialKeySpace::IMPLTYPE::READWRITE, + SpecialKeySpace::MODULE::MANAGEMENT, + 
SpecialKeySpace::IMPLTYPE::READWRITE, std::make_unique( KeyRangeRef(LiteralStringRef("maintenance/"), LiteralStringRef("maintenance0")) .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin))); registerSpecialKeySpaceModule( - SpecialKeySpace::MODULE::MANAGEMENT, SpecialKeySpace::IMPLTYPE::READWRITE, + SpecialKeySpace::MODULE::MANAGEMENT, + SpecialKeySpace::IMPLTYPE::READWRITE, std::make_unique( KeyRangeRef(LiteralStringRef("data_distribution/"), LiteralStringRef("data_distribution0")) .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin))); @@ -3364,10 +3350,9 @@ ACTOR Future getRange(Database cx, output.readThroughEnd = readThroughEnd; if (BUGGIFY && limits.hasByteLimit() && output.size() > std::max(1, originalLimits.minRows)) { - printf("Buggify resizing in nativeapi\n"); // Copy instead of resizing because TSS maybe be using output's arena for comparison. This only // happens in simulation so it's fine - Standalone copy; + RangeResult copy; int newSize = deterministicRandom()->randomInt(std::max(1, originalLimits.minRows), output.size()); for (int i = 0; i < newSize; i++) { @@ -4419,8 +4404,6 @@ ACTOR static Future tryCommit(Database cx, choose { when(wait(cx->onProxiesChanged())) { reply.cancel(); - // TODO REMOVE - printf("tryCommit proxies changed ERROR!\n"); throw request_maybe_delivered(); } when(CommitID ci = wait(reply)) { diff --git a/fdbclient/StorageServerInterface.cpp b/fdbclient/StorageServerInterface.cpp index 180d1b814c..fe5ef4aaeb 100644 --- a/fdbclient/StorageServerInterface.cpp +++ b/fdbclient/StorageServerInterface.cpp @@ -37,11 +37,6 @@ bool TSS_doCompare(const GetValueRequest& req, Severity traceSeverity, UID tssId) { if (src.value.present() != tss.value.present() || (src.value.present() && src.value.get() != tss.value.get())) { - printf("GetValue %s @ %lld mismatch: src=%s, tss=%s\n", - req.key.printable().c_str(), - req.version, - src.value.present() ? 
traceChecksumValue(src.value.get()).c_str() : "missing", - tss.value.present() ? traceChecksumValue(tss.value.get()).c_str() : "missing"); TraceEvent(traceSeverity, "TSSMismatchGetValue") .suppressFor(1.0) .detail("TSSID", tssId) @@ -52,8 +47,6 @@ bool TSS_doCompare(const GetValueRequest& req, return false; } - // printf("tss GetValueReply matched! src=%s, tss=%s\n", src.value.present() ? src.value.get().toString().c_str() : - // "missing", tss.value.present() ? tss.value.get().toString().c_str() : "missing"); return true; } @@ -70,8 +63,6 @@ bool TSS_doCompare(const GetKeyRequest& req, // check key selectors that start in a TSS shard and end in a non-TSS shard because the other read queries and the // consistency check will eventually catch a misbehaving storage engine. bool matches = true; - // printf("GetKey %s:<%s:%d @ %lld start:\n", - // req.sel.getKey().toString().c_str(), req.sel.orEqual ? "=" : "", req.sel.offset, req.version); if (src.sel.orEqual == tss.sel.orEqual && src.sel.offset == tss.sel.offset) { // full matching case if (src.sel.offset == 0 && src.sel.orEqual) { @@ -99,23 +90,9 @@ bool TSS_doCompare(const GetKeyRequest& req, // where one response has <=0 with the actual result and the other has <0 with the shard upper boundary. // So whichever one has the actual result should have the lower key. bool tssOffsetLarger = (src.sel.offset == tss.sel.offset) ? tss.sel.orEqual : src.sel.offset < tss.sel.offset; - // printf(" partial comparison: tssLarger=%s, tssOffsetLarger=%s, matches=%s\n", tssKeyLarger ? "T" : "F", - // tssOffsetLarger ? "T": "F", matches ? "T" : "F"); matches = tssKeyLarger != tssOffsetLarger; } if (!matches) { - // TODO REMOVE print - printf("GetKey %s:<%s:%d @ %lld mismatch: src=%s:<%s:%d, tss=%s:<%s:%d\n", - req.sel.getKey().printable().c_str(), - req.sel.orEqual ? "=" : "", - req.sel.offset, - req.version, - src.sel.getKey().printable().c_str(), - src.sel.orEqual ? 
"=" : "", - src.sel.offset, - tss.sel.getKey().printable().c_str(), - tss.sel.orEqual ? "=" : "", - tss.sel.offset); TraceEvent(traceSeverity, "TSSMismatchGetKey") .suppressFor(1.0) .detail("TSSID", tssId) @@ -138,32 +115,16 @@ bool TSS_doCompare(const GetKeyValuesRequest& req, Severity traceSeverity, UID tssId) { if (src.more != tss.more || src.data != tss.data) { - // TODO REMOVE debugging prints - printf("GetKeyValues [%s:<%s:%d - %s:<%s:%d) @ %lld (lim=%d limB=%d) mismatch:\n", - req.begin.getKey().printable().c_str(), - req.begin.orEqual ? "=" : "", - req.begin.offset, - req.end.getKey().printable().c_str(), - req.end.orEqual ? "=" : "", - req.end.offset, - req.version, - req.limit, - req.limitBytes); std::string ssResultsString = format("(%d)%s:\n", src.data.size(), src.more ? "+" : ""); - printf("src= (%d)%s:", src.data.size(), src.more ? "+" : ""); for (auto& it : src.data) { - printf(" %s=%s\n", it.key.printable().c_str(), it.value.printable().c_str()); ssResultsString += "\n" + it.key.printable() + "=" + traceChecksumValue(it.value); } std::string tssResultsString = format("(%d)%s:\n", tss.data.size(), tss.more ? "+" : ""); - printf("tss= (%d)%s:", tss.data.size(), tss.more ? "+" : ""); for (auto& it : tss.data) { - printf(" %s=%s\n", it.key.printable().c_str(), it.value.printable().c_str()); tssResultsString += "\n" + it.key.printable() + "=" + traceChecksumValue(it.value); } - printf("\n"); TraceEvent(traceSeverity, "TSSMismatchGetKeyValues") .suppressFor(1.0) @@ -182,10 +143,6 @@ bool TSS_doCompare(const GetKeyValuesRequest& req, return false; } - /*printf("tss GetKeyValues [%s:<%s:%d - %s:<%s:%d) matched! %d=%d\n", - req.begin.getKey().printable().c_str(), req.begin.orEqual ? "=" : "", req.begin.offset, - req.end.getKey().printable().c_str(), req.end.orEqual ? 
"=" : "", req.end.offset, - src.data.size(), tss.data.size());*/ return true; } @@ -195,7 +152,7 @@ bool TSS_doCompare(const WatchValueRequest& req, const WatchValueReply& tss, Severity traceSeverity, UID tssId) { - // TODO should this check that both returned the same version? We mainly want to duplicate watches just for load + // We duplicate watches just for load, no need to validte replies. return true; } @@ -233,53 +190,9 @@ bool TSS_doCompare(const SplitRangeRequest& req, const SplitRangeReply& tss, Severity traceSeverity, UID tssId) { - // TODO in theory this should return the same response from both right? return true; } -// don't duplicate \xff reads or fetchKeys (avoid adding load to servers) -template <> -bool TSS_shouldDuplicateRequest(const GetValueRequest& req) { - return req.key.size() == 0 || req.key[0] != 0xff; -} - -template <> -bool TSS_shouldDuplicateRequest(const GetKeyRequest& req) { - return req.sel.getKey().size() == 0 || req.sel.getKey()[0] != 0xff; -} - -template <> -bool TSS_shouldDuplicateRequest(const GetKeyValuesRequest& req) { - return (req.begin.getKey().size() == 0 || req.begin.getKey()[0] != 0xff || req.end.getKey().size() == 0 || - req.end.getKey()[0] != 0xff) && - !req.isFetchKeys; -} - -template <> -bool TSS_shouldDuplicateRequest(const WatchValueRequest& req) { - return req.key.size() == 0 || req.key[0] != 0xff; -} - -template <> -bool TSS_shouldDuplicateRequest(const WaitMetricsRequest& req) { - return false; -} - -template <> -bool TSS_shouldDuplicateRequest(const SplitMetricsRequest& req) { - return false; -} - -template <> -bool TSS_shouldDuplicateRequest(const ReadHotSubRangeRequest& req) { - return false; -} - -template <> -bool TSS_shouldDuplicateRequest(const SplitRangeRequest& req) { - return false; -} - // only record metrics for data reads template <> @@ -317,20 +230,26 @@ void TSSMetrics::recordLatency(const SplitRangeRequest& req, double ssLatency, d // ------------------- -// TODO ADD UNIT TESTS for compare 
methods, especially GetKey!! TEST_CASE("/StorageServerInterface/TSSCompare/TestComparison") { printf("testing tss comparisons\n"); + // to avoid compiler issues that StringRef(char* is deprecated) + std::string s_a = "a"; + std::string s_b = "b"; + std::string s_c = "c"; + std::string s_d = "d"; + std::string s_e = "e"; + // test getValue GetValueRequest gvReq; - gvReq.key = StringRef("a"); + gvReq.key = StringRef(s_a); gvReq.version = 5; UID tssId; GetValueReply gvReplyMissing; - GetValueReply gvReplyA(Optional(StringRef("a")), false); - GetValueReply gvReplyB(Optional(StringRef("b")), false); + GetValueReply gvReplyA(Optional(StringRef(s_a)), false); + GetValueReply gvReplyB(Optional(StringRef(s_b)), false); ASSERT(TSS_doCompare(gvReq, gvReplyMissing, gvReplyMissing, SevInfo, tssId)); ASSERT(TSS_doCompare(gvReq, gvReplyA, gvReplyA, SevInfo, tssId)); ASSERT(TSS_doCompare(gvReq, gvReplyB, gvReplyB, SevInfo, tssId)); @@ -341,15 +260,15 @@ TEST_CASE("/StorageServerInterface/TSSCompare/TestComparison") { // test GetKeyValues Arena a; // for all of the refs. ASAN complains if this isn't done. 
Could also make them all standalone i guess GetKeyValuesRequest gkvReq; - gkvReq.begin = firstGreaterOrEqual(StringRef(a, "A")); - gkvReq.end = firstGreaterOrEqual(StringRef(a, "C")); + gkvReq.begin = firstGreaterOrEqual(StringRef(a, s_a)); + gkvReq.end = firstGreaterOrEqual(StringRef(a, s_b)); gkvReq.version = 5; GetKeyValuesReply gkvReplyEmpty; GetKeyValuesReply gkvReplyOne; KeyValueRef v; - v.key = StringRef(a, "a"); - v.value = StringRef(a, "1"); + v.key = StringRef(a, s_a); + v.value = StringRef(a, s_b); gkvReplyOne.data.push_back_deep(gkvReplyOne.arena, v); GetKeyValuesReply gkvReplyOneMore; gkvReplyOneMore.data.push_back_deep(gkvReplyOneMore.arena, v); @@ -363,14 +282,14 @@ TEST_CASE("/StorageServerInterface/TSSCompare/TestComparison") { // test GetKey GetKeyRequest gkReq; - gkReq.sel = KeySelectorRef(StringRef(a, "Z"), false, 1); + gkReq.sel = KeySelectorRef(StringRef(a, s_a), false, 1); gkReq.version = 5; - GetKeyReply gkReplyA(KeySelectorRef(StringRef(a, "A"), false, 20), false); - GetKeyReply gkReplyB(KeySelectorRef(StringRef(a, "B"), false, 10), false); - GetKeyReply gkReplyC(KeySelectorRef(StringRef(a, "C"), true, 0), false); - GetKeyReply gkReplyD(KeySelectorRef(StringRef(a, "D"), false, -10), false); - GetKeyReply gkReplyE(KeySelectorRef(StringRef(a, "E"), false, -20), false); + GetKeyReply gkReplyA(KeySelectorRef(StringRef(a, s_a), false, 20), false); + GetKeyReply gkReplyB(KeySelectorRef(StringRef(a, s_b), false, 10), false); + GetKeyReply gkReplyC(KeySelectorRef(StringRef(a, s_c), true, 0), false); + GetKeyReply gkReplyD(KeySelectorRef(StringRef(a, s_d), false, -10), false); + GetKeyReply gkReplyE(KeySelectorRef(StringRef(a, s_e), false, -20), false); // identical cases ASSERT(TSS_doCompare(gkReq, gkReplyA, gkReplyA, SevInfo, tssId)); @@ -396,26 +315,26 @@ TEST_CASE("/StorageServerInterface/TSSCompare/TestComparison") { // test same offset/orEqual wrong key ASSERT(!TSS_doCompare(gkReq, - GetKeyReply(KeySelectorRef(StringRef(a, "A"), true, 0), 
false), - GetKeyReply(KeySelectorRef(StringRef("B"), true, 0), false), + GetKeyReply(KeySelectorRef(StringRef(a, s_a), true, 0), false), + GetKeyReply(KeySelectorRef(StringRef(a, s_b), true, 0), false), SevInfo, tssId)); // this could be from different shard boundaries, so don't say it's a mismatch ASSERT(TSS_doCompare(gkReq, - GetKeyReply(KeySelectorRef(StringRef(a, "A"), false, 10), false), - GetKeyReply(KeySelectorRef(StringRef(a, "B"), false, 10), false), + GetKeyReply(KeySelectorRef(StringRef(a, s_a), false, 10), false), + GetKeyReply(KeySelectorRef(StringRef(a, s_b), false, 10), false), SevInfo, tssId)); // test offsets and key difference don't match ASSERT(!TSS_doCompare(gkReq, - GetKeyReply(KeySelectorRef(StringRef(a, "A"), false, 0), false), - GetKeyReply(KeySelectorRef(StringRef("B"), false, 10), false), + GetKeyReply(KeySelectorRef(StringRef(a, s_a), false, 0), false), + GetKeyReply(KeySelectorRef(StringRef(a, s_b), false, 10), false), SevInfo, tssId)); ASSERT(!TSS_doCompare(gkReq, - GetKeyReply(KeySelectorRef(StringRef(a, "A"), false, -10), false), - GetKeyReply(KeySelectorRef(StringRef("B"), false, 0), false), + GetKeyReply(KeySelectorRef(StringRef(a, s_a), false, -10), false), + GetKeyReply(KeySelectorRef(StringRef(a, s_b), false, 0), false), SevInfo, tssId)); @@ -423,42 +342,41 @@ TEST_CASE("/StorageServerInterface/TSSCompare/TestComparison") { // positive // one that didn't find is +1 ASSERT(TSS_doCompare(gkReq, - GetKeyReply(KeySelectorRef(StringRef(a, "A"), false, 1), false), - GetKeyReply(KeySelectorRef(StringRef("B"), true, 0), false), + GetKeyReply(KeySelectorRef(StringRef(a, s_a), false, 1), false), + GetKeyReply(KeySelectorRef(StringRef(a, s_b), true, 0), false), SevInfo, tssId)); ASSERT(!TSS_doCompare(gkReq, - GetKeyReply(KeySelectorRef(StringRef(a, "A"), true, 0), false), - GetKeyReply(KeySelectorRef(StringRef("B"), false, 1), false), + GetKeyReply(KeySelectorRef(StringRef(a, s_a), true, 0), false), + GetKeyReply(KeySelectorRef(StringRef(a, 
s_b), false, 1), false), SevInfo, tssId)); // negative will have zero offset but not equal set ASSERT(TSS_doCompare(gkReq, - GetKeyReply(KeySelectorRef(StringRef(a, "A"), true, 0), false), - GetKeyReply(KeySelectorRef(StringRef("B"), false, 0), false), + GetKeyReply(KeySelectorRef(StringRef(a, s_a), true, 0), false), + GetKeyReply(KeySelectorRef(StringRef(a, s_b), false, 0), false), SevInfo, tssId)); ASSERT(!TSS_doCompare(gkReq, - GetKeyReply(KeySelectorRef(StringRef(a, "A"), false, 0), false), - GetKeyReply(KeySelectorRef(StringRef("B"), true, 0), false), + GetKeyReply(KeySelectorRef(StringRef(a, s_a), false, 0), false), + GetKeyReply(KeySelectorRef(StringRef(a, s_b), true, 0), false), SevInfo, tssId)); // test shard boundary key returned by incomplete query is the same as the key found by the other (only possible in // positive direction) ASSERT(TSS_doCompare(gkReq, - GetKeyReply(KeySelectorRef(StringRef(a, "A"), true, 0), false), - GetKeyReply(KeySelectorRef(StringRef("A"), false, 1), false), + GetKeyReply(KeySelectorRef(StringRef(a, s_a), true, 0), false), + GetKeyReply(KeySelectorRef(StringRef(a, s_a), false, 1), false), SevInfo, tssId)); // explictly test checksum function - std::string s = "A"; std::string s12 = "ABCDEFGHIJKL"; std::string s13 = "ABCDEFGHIJKLO"; std::string checksumStart13 = "(13)"; - ASSERT(s == traceChecksumValue(StringRef(s))); + ASSERT(s_a == traceChecksumValue(StringRef(s_a))); ASSERT(s12 == traceChecksumValue(StringRef(s12))); ASSERT(checksumStart13 == traceChecksumValue(StringRef(s13)).substr(0, 4)); return Void(); diff --git a/fdbclient/StorageServerInterface.h b/fdbclient/StorageServerInterface.h index 9a514a447e..be1a223453 100644 --- a/fdbclient/StorageServerInterface.h +++ b/fdbclient/StorageServerInterface.h @@ -56,10 +56,7 @@ struct StorageServerInterface { LocalityData locality; UID uniqueID; - // TODO get rid of explicit mapping? 
- // Effectively implements Optional but serializer didn't like Optional - bool isTss; - UID tssPairID; + Optional tssPairID; RequestStream getValue; RequestStream getKey; @@ -80,12 +77,13 @@ struct StorageServerInterface { RequestStream getReadHotRanges; RequestStream getRangeSplitPoints; - explicit StorageServerInterface(UID uid) : uniqueID(uid), isTss(false) {} - StorageServerInterface() : uniqueID(deterministicRandom()->randomUniqueID()), isTss(false) {} + explicit StorageServerInterface(UID uid) : uniqueID(uid) {} + StorageServerInterface() : uniqueID(deterministicRandom()->randomUniqueID()) {} NetworkAddress address() const { return getValue.getEndpoint().getPrimaryAddress(); } NetworkAddress stableAddress() const { return getValue.getEndpoint().getStableAddress(); } Optional secondaryAddress() const { return getValue.getEndpoint().addresses.secondaryAddress; } UID id() const { return uniqueID; } + bool isTss() const { return tssPairID.present(); } std::string toString() const { return id().shortString(); } template void serialize(Ar& ar) { @@ -95,7 +93,7 @@ struct StorageServerInterface { if (ar.protocolVersion().hasSmallEndpoints()) { if (ar.protocolVersion().hasTSS()) { - serializer(ar, uniqueID, locality, getValue, isTss, tssPairID); + serializer(ar, uniqueID, locality, getValue, tssPairID); } else { serializer(ar, uniqueID, locality, getValue); } diff --git a/fdbclient/SystemData.cpp b/fdbclient/SystemData.cpp index 1d7a750fe5..9ffd58464f 100644 --- a/fdbclient/SystemData.cpp +++ b/fdbclient/SystemData.cpp @@ -556,7 +556,6 @@ StorageServerInterface decodeServerListValue(ValueRef const& value) { return s; } -// TODO merge this with above stuff or something const Value serverListValueFB(StorageServerInterface const& server) { return ObjectWriter::toValue(server, IncludeVersion()); } @@ -1111,8 +1110,8 @@ void testSSISerdes(StorageServerInterface const& ssi, bool useFB) { printf("ssi=\nid=%s\nlocality=%s\nisTss=%s\ntssId=%s\naddress=%s\ngetValue=%s\n\n\n", 
ssi.id().toString().c_str(), ssi.locality.toString().c_str(), - ssi.isTss ? "true" : "false", - ssi.isTss ? ssi.tssPairID.toString().c_str() : "", + ssi.isTss() ? "true" : "false", + ssi.isTss() ? ssi.tssPairID.get().toString().c_str() : "", ssi.address().toString().c_str(), ssi.getValue.getEndpoint().token.toString().c_str()); @@ -1122,16 +1121,16 @@ void testSSISerdes(StorageServerInterface const& ssi, bool useFB) { printf("ssi2=\nid=%s\nlocality=%s\nisTss=%s\ntssId=%s\naddress=%s\ngetValue=%s\n\n\n", ssi2.id().toString().c_str(), ssi2.locality.toString().c_str(), - ssi2.isTss ? "true" : "false", - ssi2.isTss ? ssi2.tssPairID.toString().c_str() : "", + ssi2.isTss() ? "true" : "false", + ssi2.isTss() ? ssi2.tssPairID.get().toString().c_str() : "", ssi2.address().toString().c_str(), ssi2.getValue.getEndpoint().token.toString().c_str()); ASSERT(ssi.id() == ssi2.id()); ASSERT(ssi.locality == ssi2.locality); - ASSERT(ssi.isTss == ssi2.isTss); - if (ssi.isTss) { - ASSERT(ssi2.tssPairID == ssi2.tssPairID); + ASSERT(ssi.isTss() == ssi2.isTss()); + if (ssi.isTss()) { + ASSERT(ssi2.tssPairID.get() == ssi2.tssPairID.get()); } ASSERT(ssi.address() == ssi2.address()); ASSERT(ssi.getValue.getEndpoint().token == ssi2.getValue.getEndpoint().token); @@ -1149,13 +1148,11 @@ TEST_CASE("/SystemData/SerDes/SSI") { StorageServerInterface ssi; ssi.uniqueID = UID(0x1234123412341234, 0x5678567856785678); ssi.locality = localityData; - ssi.isTss = false; ssi.initEndpoints(); testSSISerdes(ssi, false); testSSISerdes(ssi, true); - ssi.isTss = true; ssi.tssPairID = UID(0x2345234523452345, 0x1238123812381238); testSSISerdes(ssi, false); diff --git a/fdbrpc/LoadBalance.actor.h b/fdbrpc/LoadBalance.actor.h index 2f1ee375bf..33a689c31d 100644 --- a/fdbrpc/LoadBalance.actor.h +++ b/fdbrpc/LoadBalance.actor.h @@ -31,9 +31,6 @@ #include "flow/flow.h" #include "flow/Knobs.h" -// TODO REMOVE? 
-#include - #include "fdbrpc/FailureMonitor.h" #include "fdbrpc/fdbrpc.h" #include "fdbrpc/Locality.h" @@ -85,9 +82,8 @@ Future tssComparison(Req req, Future> fSource, Future> fTss, TSSEndpointData tssData) { - // TODO add timeout and time requests state double startTime = now(); - state Future>> fTssWithTimeout = timeout(fTss, 5.0 /*TODO knob?*/); + state Future>> fTssWithTimeout = timeout(fTss, FLOW_KNOBS->LOAD_BALANCE_TSS_TIMEOUT); state int finished = 0; state double srcEndTime; state double tssEndTime; @@ -113,16 +109,21 @@ Future tssComparison(Req req, } } + // we want to record ss/tss errors to metrics + int srcErrorCode = error_code_success; + int tssErrorCode = error_code_success; + ++tssData.metrics->requests; if (src.isError()) { - ++tssData.metrics->ssErrors; + srcErrorCode = src.getError().code(); + tssData.metrics->ssError(srcErrorCode); } if (!tss.present()) { ++tssData.metrics->tssTimeouts; } else if (tss.get().isError()) { - ++tssData.metrics->tssErrors; - printf("Tss got error %d\n", tss.get().getError().code()); + tssErrorCode = tss.get().getError().code(); + tssData.metrics->tssError(tssErrorCode); } if (!src.isError() && tss.present() && !tss.get().isError()) { Optional srcLB = getLoadBalancedReply(&src.get()); @@ -146,13 +147,23 @@ Future tssComparison(Req req, ++tssData.metrics->mismatches; } } else if (tssLB.present() && tssLB.get().error.present()) { - ++tssData.metrics->tssErrors; - printf("Tss got LB error %d\n", tssLB.get().error.get().code()); + tssErrorCode = tssLB.get().error.get().code(); + tssData.metrics->tssError(tssErrorCode); } else if (srcLB.present() && srcLB.get().error.present()) { - ++tssData.metrics->ssErrors; + srcErrorCode = srcLB.get().error.get().code(); + tssData.metrics->ssError(srcErrorCode); } } + if (srcErrorCode != error_code_success && tssErrorCode != error_code_success && srcErrorCode != tssErrorCode) { + // if ss and tss both got different errors, record them + TraceEvent("TSSErrorMismatch") + 
.suppressFor(1.0) + .detail("TSSID", tssData.tssId) + .detail("SSError", srcErrorCode) + .detail("TSSError", tssErrorCode); + } + return Void(); } @@ -172,22 +183,20 @@ struct RequestData : NonCopyable { // This is true once setupRequest is called, even though at that point the response is Never(). bool isValid() { return response.isValid(); } - void maybeDuplicateTSSRequest(RequestStream const* stream, - Request const& request, - QueueModel* model, - Future ssResponse) { + static void maybeDuplicateTSSRequest(RequestStream const* stream, + Request& request, + QueueModel* model, + Future ssResponse) { if (model) { // Send parallel request to TSS pair, if it exists Optional tssData = model->getTssData(stream->getEndpoint().token.first()); - if (tssData.present() && TSS_shouldDuplicateRequest(request)) { + if (tssData.present()) { resetReply(request); - - // TODO add timeout from knob to tss request? // FIXME: optimize to avoid creating new netNotifiedQueue for each message RequestStream tssRequestStream(tssData.get().endpoint); Future> fTssResult = tssRequestStream.tryGetReply(request); - model->addActor.send(tssComparison(request, fResult, fTssResult, tssData.get())); + model->addActor.send(tssComparison(request, ssResponse, fTssResult, tssData.get())); } } } @@ -196,7 +205,7 @@ struct RequestData : NonCopyable { void startRequest(double backoff, bool triedAllOptions, RequestStream const* stream, - Request const& request, + Request& request, QueueModel* model) { modelHolder = Reference(); requestStarted = false; @@ -207,8 +216,8 @@ struct RequestData : NonCopyable { requestStarted = true; modelHolder = Reference(new ModelHolder(model, stream->getEndpoint().token.first())); Future resp = stream->tryGetReply(request); - maybeDuplicateTSSRequest(stream, request, model, resp); - return resp; + maybeDuplicateTSSRequest(stream, request, model, resp); + return resp; }); } else { requestStarted = true; diff --git a/fdbrpc/QueueModel.cpp b/fdbrpc/QueueModel.cpp index 
2cb5687b61..6aaaf3df34 100644 --- a/fdbrpc/QueueModel.cpp +++ b/fdbrpc/QueueModel.cpp @@ -18,8 +18,6 @@ * limitations under the License. */ -#include - #include "fdbrpc/QueueModel.h" #include "fdbrpc/LoadBalance.h" @@ -66,11 +64,10 @@ void QueueModel::updateTssEndpoint(uint64_t endpointId, TSSEndpointData tssData) auto& d = data[endpointId]; if (!d.tssData.present()) { tssCount++; + d.tssData = Optional(tssData); + } else { + d.tssData.get().generation = tssData.generation; } - - d.tssData = Optional(tssData); - // TODO REMOVE print - printf("Setting tss endpoint for %" PRIx64 " = %s\n", endpointId, tssData.endpoint.token.toString().c_str()); } void QueueModel::removeOldTssData(UID currentGeneration) { @@ -78,12 +75,6 @@ void QueueModel::removeOldTssData(UID currentGeneration) { // expire old tss mappings that aren't present in new mapping for (auto& it : data) { if (it.second.tssData.present() && it.second.tssData.get().generation != currentGeneration) { - // TODO REMOVE print - printf("Removing tss endpoint for %" PRIx64 - " because its generation %s doesn't match the current one %s\n", - it.first, - it.second.tssData.get().generation.toString().c_str(), - currentGeneration.toString().c_str()); it.second.tssData = Optional(); tssCount--; } diff --git a/fdbrpc/QueueModel.h b/fdbrpc/QueueModel.h index f8592fa9a5..1e8cd009a0 100644 --- a/fdbrpc/QueueModel.h +++ b/fdbrpc/QueueModel.h @@ -33,8 +33,7 @@ struct TSSEndpointData { UID tssId; Endpoint endpoint; Reference metrics; - UID generation; // TODO this isn't exactly like a generation since it's not ordered, i'll try to think of a better - // name + UID generation; TSSEndpointData(UID tssId, Endpoint endpoint, Reference metrics, UID generation) : tssId(tssId), endpoint(endpoint), metrics(metrics), generation(generation) {} @@ -106,7 +105,10 @@ public: double secondBudget; PromiseStream> addActor; Future laggingRequests; // requests for which a different recipient already answered + PromiseStream> addTSSActor; + 
Future tssComparisons; // requests for which a different recipient already answered int laggingRequestCount; + int laggingTSSCompareCount; void updateTssEndpoint(uint64_t endpointId, TSSEndpointData endpointData); void removeOldTssData(UID currentGeneration); @@ -114,9 +116,13 @@ public: QueueModel() : secondMultiplier(1.0), secondBudget(0), laggingRequestCount(0), tssCount(0) { laggingRequests = actorCollection(addActor.getFuture(), &laggingRequestCount); + tssComparisons = actorCollection(addTSSActor.getFuture(), &laggingTSSCompareCount); } - ~QueueModel() { laggingRequests.cancel(); } + ~QueueModel() { + laggingRequests.cancel(); + tssComparisons.cancel(); + } private: std::unordered_map data; diff --git a/fdbrpc/TSSComparison.h b/fdbrpc/TSSComparison.h index 6724e3dae7..335e8ae68e 100644 --- a/fdbrpc/TSSComparison.h +++ b/fdbrpc/TSSComparison.h @@ -37,7 +37,7 @@ struct TSSMetrics : ReferenceCounted, NonCopyable { Counter tssTimeouts; Counter mismatches; - // TODO we could probably just ignore getKey as it's seldom used? + // We could probably just ignore getKey as it's seldom used? 
ContinuousSample SSgetValueLatency; ContinuousSample SSgetKeyLatency; ContinuousSample SSgetKeyValuesLatency; @@ -46,6 +46,19 @@ struct TSSMetrics : ReferenceCounted, NonCopyable { ContinuousSample TSSgetKeyLatency; ContinuousSample TSSgetKeyValuesLatency; + std::unordered_map ssErrorsByCode; + std::unordered_map tssErrorsByCode; + + void ssError(int code) { + ++ssErrors; + ssErrorsByCode[code]++; + } + + void tssError(int code) { + ++tssErrors; + tssErrorsByCode[code]++; + } + template void recordLatency(const Req& req, double ssLatency, double tssLatency); @@ -57,6 +70,9 @@ struct TSSMetrics : ReferenceCounted, NonCopyable { TSSgetValueLatency.clear(); TSSgetKeyLatency.clear(); TSSgetKeyValuesLatency.clear(); + + tssErrorsByCode.clear(); + ssErrorsByCode.clear(); } TSSMetrics() @@ -65,11 +81,6 @@ struct TSSMetrics : ReferenceCounted, NonCopyable { SSgetKeyValuesLatency(1000), TSSgetValueLatency(1000), TSSgetKeyLatency(1000), TSSgetKeyValuesLatency(1000) {} }; -// global static functions - -template -bool TSS_shouldDuplicateRequest(const Req& req); - // part of the contract of this function is that if there is a mismatch, the implementation needs to record a trace // event with the specified severity and tssId in the event. 
template diff --git a/fdbrpc/fdbrpc.h b/fdbrpc/fdbrpc.h index e15e0126a1..a2a6af5af6 100644 --- a/fdbrpc/fdbrpc.h +++ b/fdbrpc/fdbrpc.h @@ -335,7 +335,6 @@ public: Future disc = makeDependent(IFailureMonitor::failureMonitor()).onDisconnectOrFailure(getEndpoint(taskID)); if (disc.isReady()) { - printf("got disconnect or failure 1 :O\n"); return ErrorOr(request_maybe_delivered()); } Reference peer = @@ -354,7 +353,6 @@ public: Future disc = makeDependent(IFailureMonitor::failureMonitor()).onDisconnectOrFailure(getEndpoint()); if (disc.isReady()) { - printf("got disconnect or failure 2 :O\n"); return ErrorOr(request_maybe_delivered()); } Reference peer = diff --git a/fdbserver/ApplyMetadataMutation.cpp b/fdbserver/ApplyMetadataMutation.cpp index 87044f49b7..7349918f7a 100644 --- a/fdbserver/ApplyMetadataMutation.cpp +++ b/fdbserver/ApplyMetadataMutation.cpp @@ -68,6 +68,12 @@ void applyMetadataMutations(SpanID const& spanContext, // std::map> cacheRangeInfo; std::map cachedRangeInfo; + // Testing Storage Server removal (clearing serverTagKey) needs to read tss server list value to determine it is a + // tss + find partner's tag to send the private mutation. 
Since the removeStorageServer transaction clears both the + // storage list and server tag, we have to enforce ordering, proccessing the server tag first, and postpone the + // server list clear until the end; + std::vector tssServerListToRemove; + for (auto const& m : mutations) { //TraceEvent("MetadataMutation", dbgid).detail("M", m.toString()); if (toCommit) { @@ -95,14 +101,14 @@ void applyMetadataMutations(SpanID const& spanContext, for (const auto& id : src) { auto storageInfo = getStorageInfo(id, storageCache, txnStateStore); - ASSERT(!storageInfo->interf.isTss); + ASSERT(!storageInfo->interf.isTss()); ASSERT(storageInfo->tag != invalidTag); info.tags.push_back(storageInfo->tag); info.src_info.push_back(storageInfo); } for (const auto& id : dest) { auto storageInfo = getStorageInfo(id, storageCache, txnStateStore); - ASSERT(!storageInfo->interf.isTss); + ASSERT(!storageInfo->interf.isTss()); ASSERT(storageInfo->tag != invalidTag); info.tags.push_back(storageInfo->tag); info.dest_info.push_back(storageInfo); @@ -115,11 +121,8 @@ void applyMetadataMutations(SpanID const& spanContext, txnStateStore->set(KeyValueRef(m.param1, m.param2)); } else if (m.param1.startsWith(serverKeysPrefix)) { if (toCommit) { - Optional t = - txnStateStore->readValue(serverTagKeyFor(serverKeysDecodeServer(m.param1))).get(); - // printf("got SetValue for serverKeysPrefix/%s, tag=%s\n", - // serverKeysDecodeServer(m.param1).toString().c_str(), t.present() ? 
- // decodeServerTagValue(t.get()).toString().c_str() : ""); + Tag tag = decodeServerTagValue( + txnStateStore->readValue(serverTagKeyFor(serverKeysDecodeServer(m.param1))).get().get()); MutationRef privatized = m; privatized.param1 = m.param1.withPrefix(systemKeys.begin, arena); TraceEvent(SevDebug, "SendingPrivateMutation", dbgid) @@ -127,14 +130,9 @@ void applyMetadataMutations(SpanID const& spanContext, .detail("Privatized", privatized.toString()) .detail("Server", serverKeysDecodeServer(m.param1)) .detail("TagKey", serverTagKeyFor(serverKeysDecodeServer(m.param1))) - .detail( - "Tag", - decodeServerTagValue( - txnStateStore->readValue(serverTagKeyFor(serverKeysDecodeServer(m.param1))).get().get()) - .toString()); + .detail("Tag", tag.toString()); - toCommit->addTag(decodeServerTagValue( - txnStateStore->readValue(serverTagKeyFor(serverKeysDecodeServer(m.param1))).get().get())); + toCommit->addTag(tag); toCommit->writeTypedMessage(privatized); } } else if (m.param1.startsWith(serverTagPrefix)) { @@ -386,8 +384,20 @@ void applyMetadataMutations(SpanID const& spanContext, } } if (serverListKeys.intersects(range)) { - if (!initialCommit) - txnStateStore->clear(range & serverListKeys); + if (!initialCommit) { + KeyRangeRef rangeToClear = range & serverListKeys; + if (rangeToClear.singleKeyRange()) { + UID id = decodeServerListKey(rangeToClear.begin); + Optional ssiV = txnStateStore->readValue(serverListKeyFor(id)).get(); + if (ssiV.present() && decodeServerListValue(ssiV.get()).isTss()) { + tssServerListToRemove.push_back(rangeToClear); + } else { + txnStateStore->clear(rangeToClear); + } + } else { + txnStateStore->clear(rangeToClear); + } + } } if (tagLocalityListKeys.intersects(range)) { if (!initialCommit) @@ -418,6 +428,32 @@ void applyMetadataMutations(SpanID const& spanContext, toCommit->writeTypedMessage(privatized); } } + // Might be a tss removal, which doesn't store a tag there. 
+ // Chained if is a little verbose, but avoids unecessary work + if (!initialCommit && !serverKeysCleared.size()) { + KeyRangeRef maybeTssRange = range & serverTagKeys; + if (maybeTssRange.singleKeyRange()) { + UID id = decodeServerTagKey(maybeTssRange.begin); + Optional ssiV = txnStateStore->readValue(serverListKeyFor(id)).get(); + + if (ssiV.present()) { + StorageServerInterface ssi = decodeServerListValue(ssiV.get()); + if (ssi.isTss()) { + Optional tagV = + txnStateStore->readValue(serverTagKeyFor(ssi.tssPairID.get())).get(); + if (tagV.present()) { + MutationRef privatized = m; + privatized.param1 = maybeTssRange.begin.withPrefix(systemKeys.begin, arena); + privatized.param2 = + keyAfter(maybeTssRange.begin, arena).withPrefix(systemKeys.begin, arena); + + toCommit->addTag(decodeServerTagValue(tagV.get())); + toCommit->writeTypedMessage(privatized); + } + } + } + } + } } if (!initialCommit) { KeyRangeRef clearRange = range & serverTagKeys; @@ -575,6 +611,10 @@ void applyMetadataMutations(SpanID const& spanContext, } } + for (KeyRangeRef& range : tssServerListToRemove) { + txnStateStore->clear(range); + } + // If we accumulated private mutations for cached key-ranges, we also need to // tag them with the relevant storage servers. 
This is done to make the storage // servers aware of the cached key-ranges diff --git a/fdbserver/ClusterController.actor.cpp b/fdbserver/ClusterController.actor.cpp index abb87fdf2d..d6a2482950 100644 --- a/fdbserver/ClusterController.actor.cpp +++ b/fdbserver/ClusterController.actor.cpp @@ -1,4 +1,3 @@ - /* * ClusterController.actor.cpp * @@ -3186,9 +3185,9 @@ ACTOR Future workerAvailabilityWatch(WorkerInterface worker, checkOutstandingRequests(cluster); } } - when(wait(failed)) { // remove workers that have failed WorkerInfo& failedWorkerInfo = cluster->id_worker[worker.locality.processId()]; + if (!failedWorkerInfo.reply.isSet()) { failedWorkerInfo.reply.send( RegisterWorkerReply(failedWorkerInfo.details.processClass, failedWorkerInfo.priorityInfo)); @@ -3379,12 +3378,6 @@ void clusterRegisterMaster(ClusterControllerData* self, RegisterMasterRequest co isChanged = true; } - // TODO remove debugging - printf("CC:\ntss_count=%d\ntss_storage_engine=%d|%s\n", - db->config.desiredTSSCount, - db->config.testingStorageServerStoreType, - db->config.testingStorageServerStoreType.toString().c_str()); - // Construct the client information if (db->clientInfo->get().commitProxies != req.commitProxies || db->clientInfo->get().grvProxies != req.grvProxies) { @@ -3894,12 +3887,11 @@ ACTOR Future monitorTSSMapping(ClusterControllerData* self) { tssIdMap[it.first] = it.second; // ensure two storage servers don't map to same TSS ASSERT(seenTssIds.insert(it.second).second); + // ensure a storage server doesn't accidentally map to itself (unless we're in HACK_IDENTITY_MAPPING + // mode) + ASSERT(SERVER_KNOBS->TSS_HACK_IDENTITY_MAPPING || it.first != it.second); } - // TODO REMOVE print - printf("tss mapping of size %d\n", tssIdMap.size()); - - // TODO is copying storage server interfaces bad? 
state std::vector> newMapping; state std::map oldMapping; state bool mappingChanged = false; @@ -3909,10 +3901,6 @@ ACTOR Future monitorTSSMapping(ClusterControllerData* self) { for (auto& it : clientInfo.tssMapping) { oldMapping[it.first] = it.second; if (!tssIdMap.count(it.first)) { - // TODO add trace event - printf("tss mapping removed: %s=%s\n", - it.first.toString().c_str(), - it.second.id().toString().c_str()); TraceEvent("TSS_MappingRemoved", self->id) .detail("SSID", it.first) .detail("TSSID", it.second.id()); @@ -3936,22 +3924,16 @@ ACTOR Future monitorTSSMapping(ClusterControllerData* self) { .detail("SSID", it.first) .detail("TSSID", it.second) .detail("OldTSSID", interf.id()); - printf("tss mapping updated: %s=%s\n", - it.first.toString().c_str(), - it.second.toString().c_str()); mappingChanged = true; } } else { - // TODO add trace event TraceEvent("TSS_MappingAdded", self->id).detail("SSID", it.first).detail("TSSID", it.second); - printf("tss mapping added: %s=%s\n", it.first.toString().c_str(), it.second.toString().c_str()); mappingChanged = true; } state UID ssid = it.first; state UID tssid = it.second; // request storage server interface for tssid, add it to results - // TODO could issue all of these futures and then process then after as an optimization Optional tssiVal = wait(tr->get(serverListKeyFor(it.second))); // because we read the tss mapping in the same transaction, there can be no races with tss removal @@ -3961,11 +3943,6 @@ ACTOR Future monitorTSSMapping(ClusterControllerData* self) { StorageServerInterface tssi = decodeServerListValue(tssiVal.get()); if (oldTssId.present() && tssi.id() == oldTssId.get() && oldGetValueEndpoint.present() && oldGetValueEndpoint.get() != tssi.getValue.getEndpoint().token) { - // TODO REMOVE print - printf("tss %s restarted, getValue %s -> %s\n", - tssi.id().toString().c_str(), - oldGetValueEndpoint.get().toString().c_str(), - tssi.getValue.getEndpoint().token.toString().c_str()); mappingChanged = true; } 
newMapping.push_back(std::pair(ssid, tssi)); @@ -3973,8 +3950,6 @@ ACTOR Future monitorTSSMapping(ClusterControllerData* self) { // if nothing changed, skip updating if (mappingChanged) { - // TODO REMOVE print - printf("CC updating tss client and server info\n"); clientInfo.id = deterministicRandom()->randomUniqueID(); clientInfo.tssMapping = newMapping; self->db.clientInfo->set(clientInfo); @@ -4592,7 +4567,6 @@ ACTOR Future clusterControllerCore(ClusterControllerFullInterface interf, when(GetWorkersRequest req = waitNext(interf.getWorkers.getFuture())) { ++self.getWorkersRequests; vector workers; - // printf("CC got GetWorkersRequest\n"); for (auto& it : self.id_worker) { if ((req.flags & GetWorkersRequest::NON_EXCLUDED_PROCESSES_ONLY) && diff --git a/fdbserver/CommitProxyServer.actor.cpp b/fdbserver/CommitProxyServer.actor.cpp index 3fc1ed02c3..d1469c0d3b 100644 --- a/fdbserver/CommitProxyServer.actor.cpp +++ b/fdbserver/CommitProxyServer.actor.cpp @@ -1507,7 +1507,6 @@ ACTOR static Future rejoinServer(CommitProxyInterface proxy, ProxyCommitDa loop { GetStorageServerRejoinInfoRequest req = waitNext(proxy.getStorageServerRejoinInfo.getFuture()); - printf("Proxy got Rejoin req for %s\n", req.id.toString().c_str()); if (commitData->txnStateStore->readValue(serverListKeyFor(req.id)).get().present()) { GetStorageServerRejoinInfoReply rep; rep.version = commitData->version; @@ -1568,10 +1567,8 @@ ACTOR static Future rejoinServer(CommitProxyInterface proxy, ProxyCommitDa } rep.newTag = Tag(maxTagLocality + 1, 0); } - printf("Proxy sent Rejoin response for %s\n", req.id.toString().c_str()); req.reply.send(rep); } else { - printf("Proxy notifying %s it can't rejoin because it was removed.\n", req.id.toString().c_str()); req.reply.sendError(worker_removed()); } } diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index cbb0364178..659ce2cbd0 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp 
@@ -67,6 +67,7 @@ struct TCServerInfo : public ReferenceCounted { Promise removed; Future onRemoved; Future onTSSPairRemoved; + Promise killTss; Promise wakeUpTracker; bool inDesiredDC; LocalityEntry localityEntry; @@ -85,7 +86,7 @@ struct TCServerInfo : public ReferenceCounted { : id(ssi.id()), collection(collection), lastKnownInterface(ssi), lastKnownClass(processClass), dataInFlightToServer(0), onInterfaceChanged(interfaceChanged.getFuture()), onRemoved(removed.getFuture()), inDesiredDC(inDesiredDC), storeType(KeyValueStoreType::END), onTSSPairRemoved(Never()) { - if (!ssi.isTss) { + if (!ssi.isTss()) { localityEntry = ((LocalityMap*)storageServerSet.getPtr())->add(ssi.locality, &id); } } @@ -451,19 +452,16 @@ ACTOR Future> getInitialDataDistribution(Data for (int i = 0; i < serverList.get().size(); i++) { auto ssi = decodeServerListValue(serverList.get()[i].value); - if (!ssi.isTss) { - printf("DD adding SS %s on init\n", ssi.id().toString().c_str()); + if (!ssi.isTss()) { result->allServers.push_back(std::make_pair(ssi, id_data[ssi.locality.processId()].processClass)); server_dc[ssi.id()] = ssi.locality.dcId(); } else { - printf("DD ignoring TSS %s on init until after team building\n", ssi.id().toString().c_str()); tss_servers.push_back(std::make_pair(ssi, id_data[ssi.locality.processId()].processClass)); } } break; } catch (Error& e) { - printf("get initial DD failed %d\n", e.code()); wait(tr.onError(e)); ASSERT(!succeeded); // We shouldn't be retrying if we have already started modifying result in this loop @@ -557,7 +555,6 @@ ACTOR Future> getInitialDataDistribution(Data beginKey = keyServers.end()[-1].key; break; } catch (Error& e) { - printf("GetInitialTeams got error %d\n", e.code()); TraceEvent("GetInitialTeamsKeyServersRetry", distributorId).error(e); wait(tr.onError(e)); @@ -573,7 +570,6 @@ ACTOR Future> getInitialDataDistribution(Data // add tss to server list AFTER teams are built for (auto& it : tss_servers) { - printf("DD adding TSS %s on init\n", 
it.first.id().toString().c_str()); result->allServers.push_back(it); } @@ -1167,7 +1163,6 @@ struct DDTeamCollection : ReferenceCounted { self->addActor.send(self->checkInvalidLocalities); } } - printf("%p init adding %s\n", (void*)self, i->first.toString().c_str()); self->addServer(i->first, i->second, self->serverTrackerErrorOut, 0, ddEnabledState); } } @@ -2447,23 +2442,16 @@ struct DDTeamCollection : ReferenceCounted { return; } - // printf("addServer(%s)\n", newServer.id().toString().c_str()); - - if (!newServer.isTss) { + if (!newServer.isTss()) { allServers.push_back(newServer.id()); } - TraceEvent(newServer.isTss ? "AddedTSS" : "AddedStorageServer", distributorId) + TraceEvent(newServer.isTss() ? "AddedTSS" : "AddedStorageServer", distributorId) .detail("ServerID", newServer.id()) .detail("ProcessClass", processClass.toString()) .detail("WaitFailureToken", newServer.waitFailure.getEndpoint().token) .detail("Address", newServer.waitFailure.getEndpoint().getPrimaryAddress()); - // TODO how to do this? 
- /*if (newServer.isTss) { - tr.detail("TSSPairID", newServer.tssPairID); - }*/ - auto& r = server_and_tss_info[newServer.id()] = makeReference( newServer, this, @@ -2472,11 +2460,11 @@ struct DDTeamCollection : ReferenceCounted { std::find(includedDCs.begin(), includedDCs.end(), newServer.locality.dcId()) != includedDCs.end(), storageServerSet); - if (newServer.isTss) { - tss_info_by_pair[newServer.tssPairID] = r; + if (newServer.isTss()) { + tss_info_by_pair[newServer.tssPairID.get()] = r; - if (server_info.count(newServer.tssPairID)) { - r->onTSSPairRemoved = server_info[newServer.tssPairID]->onRemoved; + if (server_info.count(newServer.tssPairID.get())) { + r->onTSSPairRemoved = server_info[newServer.tssPairID.get()]->onRemoved; } } else { server_info[newServer.id()] = r; @@ -2485,9 +2473,9 @@ struct DDTeamCollection : ReferenceCounted { } r->tracker = - storageServerTracker(this, cx, r.getPtr(), errorOut, addedVersion, ddEnabledState, newServer.isTss); + storageServerTracker(this, cx, r.getPtr(), errorOut, addedVersion, ddEnabledState, newServer.isTss()); - if (!newServer.isTss) { + if (!newServer.isTss()) { // link and wake up tss' tracker so it knows when this server gets removed if (tss_info_by_pair.count(newServer.id())) { tss_info_by_pair[newServer.id()]->onTSSPairRemoved = r->onRemoved; @@ -2666,19 +2654,16 @@ struct DDTeamCollection : ReferenceCounted { void removeTSS(UID removedServer) { // much simpler than remove server. 
tss isn't in any teams, so just remove it from data structures - TEST(true); // Remove a TSS frm the cluster - printf("Removing tss %s\n", removedServer.toString().c_str()); TraceEvent("RemovedTSS", distributorId).detail("ServerID", removedServer); Reference removedServerInfo = server_and_tss_info[removedServer]; - tss_info_by_pair.erase(removedServerInfo->lastKnownInterface.tssPairID); + tss_info_by_pair.erase(removedServerInfo->lastKnownInterface.tssPairID.get()); server_and_tss_info.erase(removedServer); server_status.clear(removedServer); } void removeServer(UID removedServer) { - printf("Removing ss %s\n", removedServer.toString().c_str()); TraceEvent("RemovedStorageServer", distributorId).detail("ServerID", removedServer); // ASSERT( !shardsAffectedByTeamFailure->getServersForTeam( t ) for all t in teams that contain removedServer ) @@ -2800,7 +2785,7 @@ struct DDTeamCollection : ReferenceCounted { }; TCServerInfo::~TCServerInfo() { - if (collection && ssVersionTooFarBehind.get() && !lastKnownInterface.isTss) { + if (collection && ssVersionTooFarBehind.get() && !lastKnownInterface.isTss()) { collection->removeLaggingStorageServer(lastKnownInterface.locality.zoneId().get()); } } @@ -3476,19 +3461,18 @@ ACTOR Future teamTracker(DDTeamCollection* self, Reference tea bool recheck = !healthy && (lastReady != self->initialFailureReactionDelay.isReady() || (lastZeroHealthy && !self->zeroHealthyTeams->get()) || containsFailed); - // TODO recomment - TraceEvent("TeamHealthChangeDetected", self->distributorId) - .detail("Team", team->getDesc()) - .detail("ServersLeft", serversLeft) - .detail("LastServersLeft", lastServersLeft) - .detail("AnyUndesired", anyUndesired) - .detail("LastAnyUndesired", lastAnyUndesired) - .detail("AnyWrongConfiguration", anyWrongConfiguration) - .detail("LastWrongConfiguration", lastWrongConfiguration) - .detail("Recheck", recheck) - .detail("BadTeam", badTeam) - .detail("LastZeroHealthy", lastZeroHealthy) - .detail("ZeroHealthyTeam", 
self->zeroHealthyTeams->get()); + // TraceEvent("TeamHealthChangeDetected", self->distributorId) + // .detail("Team", team->getDesc()) + // .detail("ServersLeft", serversLeft) + // .detail("LastServersLeft", lastServersLeft) + // .detail("AnyUndesired", anyUndesired) + // .detail("LastAnyUndesired", lastAnyUndesired) + // .detail("AnyWrongConfiguration", anyWrongConfiguration) + // .detail("LastWrongConfiguration", lastWrongConfiguration) + // .detail("Recheck", recheck) + // .detail("BadTeam", badTeam) + // .detail("LastZeroHealthy", lastZeroHealthy) + // .detail("ZeroHealthyTeam", self->zeroHealthyTeams->get()); lastReady = self->initialFailureReactionDelay.isReady(); lastZeroHealthy = self->zeroHealthyTeams->get(); @@ -3860,7 +3844,7 @@ ACTOR Future waitServerListChange(DDTeamCollection* self, self->serverTrackerErrorOut, tr.getReadVersion().get(), ddEnabledState); - if (!ssi.isTss) { + if (!ssi.isTss()) { self->doBuildTeams = true; } } @@ -3877,7 +3861,6 @@ ACTOR Future waitServerListChange(DDTeamCollection* self, } } } catch (Error& e) { - printf("WaitServerListChange got error %d\n", e.code()); wait(tr.onError(e)); serverListAndProcessClasses = Never(); isFetchingResults = false; @@ -3967,7 +3950,6 @@ ACTOR Future keyValueStoreTypeTracker(DDTeamCollection* self, TCServerInfo ACTOR Future waitForAllDataRemoved(Database cx, UID serverID, Version addedVersion, DDTeamCollection* teams) { state Reference tr = makeReference(cx); - printf("Waiting for data to be removed from %s\n", serverID.toString().c_str()); loop { try { tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); @@ -4005,10 +3987,6 @@ ACTOR Future storageServerFailureTracker(DDTeamCollection* self, state StorageServerInterface interf = server->lastKnownInterface; state int targetTeamNumPerServer = (SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER * (self->configuration.storageTeamSize + 1)) / 2; - - printf("Starting failure tracker for %sSS %s\n", - server->lastKnownInterface.isTss ? 
"T" : "", - server->lastKnownInterface.id().toString().c_str()); loop { state bool inHealthyZone = false; // healthChanged actor will be Never() if this flag is true if (self->healthyZone.get().present()) { @@ -4027,7 +4005,7 @@ ACTOR Future storageServerFailureTracker(DDTeamCollection* self, } } - if (!interf.isTss) { + if (!interf.isTss()) { if (self->server_status.get(interf.id()).initialized) { bool unhealthy = self->server_status.get(interf.id()).isUnhealthy(); if (unhealthy && !status->isUnhealthy()) { @@ -4059,7 +4037,7 @@ ACTOR Future storageServerFailureTracker(DDTeamCollection* self, choose { when(wait(healthChanged)) { status->isFailed = !status->isFailed; - if (!status->isFailed && !server->lastKnownInterface.isTss && + if (!status->isFailed && !server->lastKnownInterface.isTss() && (server->teams.size() < targetTeamNumPerServer || self->lastBuildTeamsFailed)) { self->doBuildTeams = true; } @@ -4119,9 +4097,6 @@ ACTOR Future storageServerTracker( state int targetTeamNumPerServer = (SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER * (self->configuration.storageTeamSize + 1)) / 2; - // TODO REMOVE - printf("Started %sSS tracker for %s\n", isTss ? "T" : "", server->id.toString().c_str()); - try { loop { status.isUndesired = !self->disableFailingLaggingServers.get() && server->ssVersionTooFarBehind.get(); @@ -4267,9 +4242,7 @@ ACTOR Future storageServerTracker( state bool recordTeamCollectionInfo = false; choose { - when(wait(failureTracker || server->onTSSPairRemoved)) { - printf("Server %s getting removed\n", server->id.toString().c_str()); - + when(wait(failureTracker || server->onTSSPairRemoved || server->killTss.getFuture())) { // The server is failed AND all data has been removed from it, so permanently remove it. TraceEvent("StatusMapChange", self->distributorId) .detail("ServerID", server->id) @@ -4280,9 +4253,8 @@ ACTOR Future storageServerTracker( } // Remove server from FF/serverList - Optional tssPairID = - server->lastKnownInterface.isTss ? 
server->lastKnownInterface.tssPairID : Optional(); - wait(removeStorageServer(cx, server->id, tssPairID, self->lock, ddEnabledState)); + wait(removeStorageServer( + cx, server->id, server->lastKnownInterface.tssPairID, self->lock, ddEnabledState)); TraceEvent("StatusMapChange", self->distributorId) .detail("ServerID", server->id) @@ -4556,7 +4528,6 @@ ACTOR Future checkAndRemoveInvalidLocalityAddr(DDTeamCollection* self) { } int numExistingSSOnAddr(DDTeamCollection* self, const AddressExclusion& addr) { - // TODO add tss? int numExistingSS = 0; for (auto& server : self->server_and_tss_info) { const NetworkAddress& netAddr = server.second->lastKnownInterface.stableAddress(); @@ -4570,16 +4541,24 @@ int numExistingSSOnAddr(DDTeamCollection* self, const AddressExclusion& addr) { } // All state that represents an ongoing tss pair recruitment -struct TSSRecruitmentState : ReferenceCounted, NonCopyable { +struct TSSPairState : ReferenceCounted, NonCopyable { Promise>> ssPairInfo; // if set, for ss to pass its id to tss pair once it is successfully recruited Promise tssPairDone; // if set, for tss to pass ss that it was successfully recruited + Optional dcId; // dc + Optional dataHallId; // data hall + bool active; - TSSRecruitmentState() : active(false) {} + TSSPairState() : active(false) {} - TSSRecruitmentState(Optional dcId) : active(true), dcId(dcId) {} + TSSPairState(const LocalityData& locality) + : active(true), dcId(locality.dcId()), dataHallId(locality.dataHallId()) {} + + bool inDataZone(const LocalityData& locality) { + return locality.dcId() == dcId && locality.dataHallId() == dataHallId; + } void cancel() { // only cancel if both haven't been set, otherwise one half of pair could think it was successful but the other @@ -4603,7 +4582,6 @@ struct TSSRecruitmentState : ReferenceCounted, NonCopyable bool tssRecruitFailed() { if (active && tssPairDone.canBeSet()) { - printf("tssPair: %p\n", &tssPairDone); tssPairDone.send(false); return true; } @@ -4631,13 
+4609,11 @@ struct TSSRecruitmentState : ReferenceCounted, NonCopyable Future waitOnTSS() { return tssPairDone.getFuture(); } }; -// TODO switch recruitment order(ish) - grab tss but don't init it, wait for it to actually grab an ss, then the ss -// signals here to start, then when done this signals the ss to add server ACTOR Future initializeStorage(DDTeamCollection* self, RecruitStorageReply candidateWorker, const DDEnabledState* ddEnabledState, bool recruitTss, - Reference tssState) { + Reference tssState) { // SOMEDAY: Cluster controller waits for availability, retry quickly if a server's Locality changes self->recruitingStream.set(self->recruitingStream.get() + 1); @@ -4656,12 +4632,6 @@ ACTOR Future initializeStorage(DDTeamCollection* self, isr.seedTag = invalidTag; isr.reqId = deterministicRandom()->randomUniqueID(); isr.interfaceId = interfaceId; - isr.isTss = recruitTss; - - printf("InitStorage %s on %sSS %s\n", - interfaceId.toString().c_str(), - recruitTss ? "T" : "", - candidateWorker.worker.address().toString().c_str()); self->recruitingIds.insert(interfaceId); self->recruitingLocalities.insert(candidateWorker.worker.stableAddress()); @@ -4675,29 +4645,21 @@ ACTOR Future initializeStorage(DDTeamCollection* self, .detail("Addr", candidateWorker.worker.address()) .detail("Locality", candidateWorker.worker.locality.toString()); - printf("TSS %s waiting for partner uid\n", interfaceId.toString().c_str()); Optional> ssPairInfoResult = wait(tssState->waitOnSS()); if (ssPairInfoResult.present()) { - printf("TSS %s got pair of %s @ %lld\n", - interfaceId.toString().c_str(), - ssPairInfoResult.get().first.toString().c_str(), - ssPairInfoResult.get().second); - isr.tssPairID = ssPairInfoResult.get().first; - isr.tssPairVersion = ssPairInfoResult.get().second; + isr.tssPairIDAndVersion = ssPairInfoResult.get(); TraceEvent("TSS_Recruit", self->distributorId) - .detail("SSID", isr.tssPairID) + .detail("SSID", ssPairInfoResult.get().first) .detail("TSSID", 
interfaceId) .detail("Stage", "TSSWaitingPair") .detail("Addr", candidateWorker.worker.address()) + .detail("Version", ssPairInfoResult.get().second) .detail("Locality", candidateWorker.worker.locality.toString()); } else { - printf("TSS %s didn't get partner, partner recruitment must have failed, abandoning\n", - interfaceId.toString().c_str()); - isr.isTss = false; doRecruit = false; - TraceEvent(SevWarn, "TSS_RecruitError", self->distributorId) + TraceEvent(SevWarnAlways, "TSS_RecruitError", self->distributorId) .detail("TSSID", interfaceId) .detail("Reason", "SS recruitment failed for some reason") .detail("Addr", candidateWorker.worker.address()) @@ -4731,10 +4693,7 @@ ACTOR Future initializeStorage(DDTeamCollection* self, if (!recruitTss && newServer.present() && tssState->ssRecruitSuccess(std::pair(interfaceId, newServer.get().addedVersion))) { - printf("ss %s signalling tss pair with version %lld\n", - interfaceId.toString().c_str(), - newServer.get().addedVersion); - // ss has a tss pair. send it this id, but wait for add server until tss is recruited + // SS has a tss pair. send it this id, but try to wait for add server until tss is recruited TraceEvent("TSS_Recruit", self->distributorId) .detail("SSID", interfaceId) @@ -4742,10 +4701,9 @@ ACTOR Future initializeStorage(DDTeamCollection* self, .detail("Addr", candidateWorker.worker.address()) .detail("Locality", candidateWorker.worker.locality.toString()); - // wait for timeout, and give up if no TSS pair recruited + // wait for timeout, but eventually move on if no TSS pair recruited Optional tssSuccessful = wait(timeout(tssState->waitOnTSS(), SERVER_KNOBS->TSS_RECRUITMENT_TIMEOUT)); - // TODO if unsuccessful, fail out tss so it doesn't cause a mismatch error? 
if (tssSuccessful.present() && tssSuccessful.get()) { TraceEvent("TSS_Recruit", self->distributorId) .detail("SSID", interfaceId) @@ -4760,14 +4718,7 @@ ACTOR Future initializeStorage(DDTeamCollection* self, : "TSS recruitment timed out") .detail("Addr", candidateWorker.worker.address()) .detail("Locality", candidateWorker.worker.locality.toString()); - - // TODO need to remove that tss here!! } - - // TODO trace event, change sev and message if timeout or if unsuccessful - printf("ss %s %ssuccessfully got tss pair!\n", - interfaceId.toString().c_str(), - (tssSuccessful.present() && tssSuccessful.get()) ? "" : "un"); } self->recruitingIds.erase(interfaceId); @@ -4791,9 +4742,6 @@ ACTOR Future initializeStorage(DDTeamCollection* self, self->serverTrackerErrorOut, newServer.get().addedVersion, ddEnabledState); - } else { - // TODO tss recruitment was cancelled since it failed to send a response to the ss, kill it - printf("TSS recruitment was cancelled, stop\n"); } } else { TraceEvent(SevWarn, "DDRecruitmentError") @@ -4806,15 +4754,12 @@ ACTOR Future initializeStorage(DDTeamCollection* self, } } + // SS and/or TSS recruitment failed at this point, update tssState if (recruitTss && tssState->tssRecruitFailed()) { TEST(true); // TSS recruitment failed for some reason - // if tss wasn't already marked as done, it was unsuccessful in recruitment - printf("tss recruitment failed for some reason, signalling ss.\n"); } if (!recruitTss && tssState->ssRecruitFailed()) { TEST(true); // SS with pair TSS recruitment failed for some reason - // if ss didn't already send its pair id to tss, it was unsuccessful in recruitment - printf("ss recruitment failed for some reason, signalling tss.\n"); } self->recruitingStream.set(self->recruitingStream.get() - 1); @@ -4832,13 +4777,11 @@ ACTOR Future storageRecruiter(DDTeamCollection* self, state std::map numSSPerAddr; // tss-specific recruitment state - state uint32_t tssToRecruit = self->configuration.desiredTSSCount - 
db->get().client.tssMapping.size(); - state Reference tssState = makeReference(); + state int32_t tssToRecruit = self->configuration.desiredTSSCount - db->get().client.tssMapping.size(); + state Reference tssState = makeReference(); + state Future checkKillTss = self->initialFailureReactionDelay; + state bool sleepingAfterKillTss = false; - printf("DD setting tssToRecruit=%d (%d - %d)\n", - tssToRecruit, - self->configuration.desiredTSSCount, - db->get().client.tssMapping.size()); TraceEvent(SevDebug, "TSS_RecruitUpdated", self->distributorId).detail("Count", tssToRecruit); loop { @@ -4924,87 +4867,97 @@ ACTOR Future storageRecruiter(DDTeamCollection* self, .detail("Locality", candidateWorker.worker.locality.toString()); TEST(true); // Starting TSS recruitment - printf("starting recruitment of tss\n"); self->isTssRecruiting = true; - tssState = makeReference(candidateWorker.worker.locality.dcId()); + tssState = makeReference(candidateWorker.worker.locality); self->addActor.send(initializeStorage(self, candidateWorker, ddEnabledState, true, tssState)); } else { - if (tssState->active && candidateWorker.worker.locality.dcId() == tssState->dcId) { - TEST(true); // TSS recruits pair in same dc + if (tssState->active && tssState->inDataZone(candidateWorker.worker.locality)) { + TEST(true); // TSS recruits pair in same dc/datahall self->isTssRecruiting = false; TraceEvent("TSS_Recruit", self->distributorId) .detail("Stage", "PairSS") .detail("Addr", candidateSSAddr.toString()) .detail("Locality", candidateWorker.worker.locality.toString()); - printf("starting recruitment of ss with eventual tss pair in dc \'%s\'\n", - tssState->dcId.present() ? 
tssState->dcId.get().toString().c_str() : ""); self->addActor.send( initializeStorage(self, candidateWorker, ddEnabledState, false, tssState)); // successfully started recruitment of pair, reset tss recruitment state - tssState = makeReference(); + tssState = makeReference(); tssToRecruit--; - if (tssToRecruit > 0) { - printf("%d tss pairs left to recruit\n", tssToRecruit); - } } else { - if (tssState->active) { - TEST(true); // TSS recruitment skipped potential pair because it's in a different dc - printf("Recruiting normal ss (no tss) b/c new ss is in different dc \'%s\' than tss " - "\'%s\'\n", - candidateWorker.worker.locality.dcId().present() - ? candidateWorker.worker.locality.dcId().get().toString().c_str() - : "", - tssState->dcId.present() ? tssState->dcId.get().toString().c_str() : ""); - } else { - printf("recruiting normal ss (no tss)\n"); - } + TEST(tssState->active); // TSS recruitment skipped potential pair because it's in a + // different dc/datahall self->addActor.send(initializeStorage( - self, candidateWorker, ddEnabledState, false, makeReference())); + self, candidateWorker, ddEnabledState, false, makeReference())); } } } when(wait(db->onChange())) { // SOMEDAY: only if clusterInterface or tss changes? fCandidateWorker = Future(); - // TODO REMOVE print int newTssToRecruit = self->configuration.desiredTSSCount - db->get().client.tssMapping.size(); + if (newTssToRecruit != tssToRecruit) { TraceEvent("TSS_RecruitUpdated", self->distributorId).detail("Count", newTssToRecruit); tssToRecruit = newTssToRecruit; } - // TODO HANDLE HERE if count is more than desired tss? 
- - printf("DD updated tssToRecruit=%d (%d - %d)\n", - tssToRecruit, - self->configuration.desiredTSSCount, - db->get().client.tssMapping.size()); - - if (self->isTssRecruiting && (tssToRecruit == 0 || self->zeroHealthyTeams->get())) { - TEST(tssToRecruit == 0); // tss recruitment cancelled due to too many TSS + if (self->isTssRecruiting && (tssToRecruit <= 0 || self->zeroHealthyTeams->get())) { + TEST(tssToRecruit <= 0); // tss recruitment cancelled due to too many TSS TEST(self->zeroHealthyTeams->get()); // tss recruitment cancelled due zero healthy teams TraceEvent(SevWarn, "TSS_RecruitCancelled", self->distributorId) - .detail("Reason", tssToRecruit == 0 ? "ConfigChange" : "ZeroHealthyTeams"); - printf("Cancelling tss recruitment! tssToRecruit: %d, zeroHealthyTeams: %s\n", - tssToRecruit, - self->zeroHealthyTeams->get() ? "T" : "F"); + .detail("Reason", tssToRecruit <= 0 ? "ConfigChange" : "ZeroHealthyTeams"); tssState->cancel(); - tssState = makeReference(); + tssState = makeReference(); self->isTssRecruiting = false; + } else if (!self->isTssRecruiting && + (tssToRecruit < 0 || + (self->zeroHealthyTeams->get() && db->get().client.tssMapping.size() > 0))) { + if (!sleepingAfterKillTss) { + checkKillTss = self->initialFailureReactionDelay; + } } } when(wait(self->zeroHealthyTeams->onChange())) { - // TODO refactor? if (self->isTssRecruiting && self->zeroHealthyTeams->get()) { TEST(self->zeroHealthyTeams->get()); // tss recruitment cancelled due zero healthy teams 2 TraceEvent(SevWarn, "TSS_RecruitCancelled", self->distributorId) .detail("Reason", "ZeroHealthyTeams"); - printf("Cancelling tss recruitment!! tssToRecruit: %d, zeroHealthyTeams: %s\n", - tssToRecruit, - self->zeroHealthyTeams->get() ? 
"T" : "F"); tssState->cancel(); - tssState = makeReference(); + tssState = makeReference(); self->isTssRecruiting = false; + } else if (!self->isTssRecruiting && self->zeroHealthyTeams->get() && + db->get().client.tssMapping.size() > 0) { + if (!sleepingAfterKillTss) { + checkKillTss = self->initialFailureReactionDelay; + } + } + } + when(wait(checkKillTss)) { + int tssToKill = std::min((int)db->get().client.tssMapping.size(), + std::max(-tssToRecruit, self->zeroHealthyTeams->get() ? 1 : 0)); + if (tssToKill > 0) { + for (int i = 0; i < tssToKill; i++) { + StorageServerInterface tssi = db->get().client.tssMapping[i].second; + + if (self->shouldHandleServer(tssi) && self->server_and_tss_info.count(tssi.id())) { + TraceEvent(SevWarn, "TSS_DDKill", self->distributorId) + .detail("TSSID", tssi.id()) + .detail("Reason", + self->zeroHealthyTeams->get() ? "ZeroHealthyTeams" : "ConfigChange"); + + Promise killPromise = self->server_and_tss_info[tssi.id()]->killTss; + if (killPromise.canBeSet()) { + killPromise.send(Void()); + } + } + } + // If we're killing a TSS because of zero healthy teams, wait a bit to give the replacing SS a + // change to join teams and stuff before killing another TSS + sleepingAfterKillTss = true; + checkKillTss = delay(SERVER_KNOBS->TSS_DD_KILL_INTERVAL); + } else { + sleepingAfterKillTss = false; + checkKillTss = Never(); } } when(wait(self->restartRecruiting.onTrigger())) {} @@ -5652,8 +5605,6 @@ ACTOR Future dataDistribution(Reference self, wait(waitForAll(actors)); return Void(); } catch (Error& e) { - // TODO REMOVE - printf("DD got error! %d\n", e.code()); trackerCancelled = true; state Error err = e; TraceEvent("DataDistributorDestroyTeamCollections").error(e); @@ -6321,6 +6272,4 @@ TEST_CASE("/DataDistribution/AddTeamsBestOf/NotEnoughServers") { ASSERT(result == 8); return Void(); -} - -// TODO add unit test for TSS recruitment? 
+} \ No newline at end of file diff --git a/fdbserver/DataDistributionTracker.actor.cpp b/fdbserver/DataDistributionTracker.actor.cpp index 51501c9b62..94f38622f0 100644 --- a/fdbserver/DataDistributionTracker.actor.cpp +++ b/fdbserver/DataDistributionTracker.actor.cpp @@ -497,22 +497,14 @@ ACTOR Future shardSplitter(DataDistributionTracker* self, .detail("MaxBytes", shardBounds.max.bytes) .detail("MetricsBytes", metrics.bytes) .detail("Bandwidth", - bandwidthStatus == BandwidthStatusHigh - ? "High" - : bandwidthStatus == BandwidthStatusNormal ? "Normal" : "Low") + bandwidthStatus == BandwidthStatusHigh ? "High" + : bandwidthStatus == BandwidthStatusNormal ? "Normal" + : "Low") .detail("BytesPerKSec", metrics.bytesPerKSecond) .detail("NumShards", numShards); } if (numShards > 1) { - // TODO REMOVE - printf("Splitting [%s - %s) into %d shards:\n", - splitKeys[0].toString().c_str(), - splitKeys[numShards].toString().c_str(), - numShards); - for (int i = 0; i < numShards; i++) { - printf(" [%s - %s)\n", splitKeys[i].toString().c_str(), splitKeys[i + 1].toString().c_str()); - } int skipRange = deterministicRandom()->randomInt(0, numShards); // The queue can't deal with RelocateShard requests which split an existing shard into three pieces, so // we have to send the unskipped ranges in this order (nibbling in from the edges of the old range) diff --git a/fdbserver/Knobs.cpp b/fdbserver/Knobs.cpp index 8e507f1727..61c3c62f82 100644 --- a/fdbserver/Knobs.cpp +++ b/fdbserver/Knobs.cpp @@ -217,7 +217,9 @@ void ServerKnobs::initialize(bool randomize, ClientKnobs* clientKnobs, bool isSi init( SERVER_LIST_DELAY, 1.0 ); init( RECRUITMENT_IDLE_DELAY, 1.0 ); init( STORAGE_RECRUITMENT_DELAY, 10.0 ); - init( TSS_RECRUITMENT_TIMEOUT, 3*STORAGE_RECRUITMENT_DELAY ); if (randomize && BUGGIFY ) TSS_RECRUITMENT_TIMEOUT = 1.0; //Super low timeout should cause tss recruitments to fail + init( TSS_HACK_IDENTITY_MAPPING, false ); // THIS SHOULD NEVER BE SET IN PROD. 
Only for performance testing + init( TSS_RECRUITMENT_TIMEOUT, 3*STORAGE_RECRUITMENT_DELAY ); if (randomize && BUGGIFY ) TSS_RECRUITMENT_TIMEOUT = 1.0; // Super low timeout should cause tss recruitments to fail + init( TSS_DD_KILL_INTERVAL, 60.0 ); if (randomize && BUGGIFY ) TSS_DD_KILL_INTERVAL = 1.0; // May kill all TSS quickly init( DATA_DISTRIBUTION_LOGGING_INTERVAL, 5.0 ); init( DD_ENABLED_CHECK_DELAY, 1.0 ); init( DD_STALL_CHECK_DELAY, 0.4 ); //Must be larger than 2*MAX_BUGGIFIED_DELAY diff --git a/fdbserver/Knobs.h b/fdbserver/Knobs.h index 9a4cc4a047..6b47e6ef30 100644 --- a/fdbserver/Knobs.h +++ b/fdbserver/Knobs.h @@ -167,7 +167,9 @@ public: double SERVER_LIST_DELAY; double RECRUITMENT_IDLE_DELAY; double STORAGE_RECRUITMENT_DELAY; + bool TSS_HACK_IDENTITY_MAPPING; double TSS_RECRUITMENT_TIMEOUT; + double TSS_DD_KILL_INTERVAL; double DATA_DISTRIBUTION_LOGGING_INTERVAL; double DD_ENABLED_CHECK_DELAY; double DD_STALL_CHECK_DELAY; diff --git a/fdbserver/MoveKeys.actor.cpp b/fdbserver/MoveKeys.actor.cpp index 927a7af00b..afd12a81c1 100644 --- a/fdbserver/MoveKeys.actor.cpp +++ b/fdbserver/MoveKeys.actor.cpp @@ -101,7 +101,6 @@ ACTOR static Future checkMoveKeysLock(Transaction* tr, bool isWrite = true) { if (!ddEnabledState->isDDEnabled()) { TraceEvent(SevDebug, "DDDisabledByInMemoryCheck"); - printf("MK: DD disabled\n"); throw movekeys_conflict(); } Optional readVal = wait(tr->get(moveKeysLockOwnerKey)); @@ -113,7 +112,6 @@ ACTOR static Future checkMoveKeysLock(Transaction* tr, UID lastWrite = readVal.present() ? 
BinaryReader::fromStringRef(readVal.get(), Unversioned()) : UID(); if (lastWrite != lock.prevWrite) { TEST(true); // checkMoveKeysLock: Conflict with previous owner - printf("MK: conflict with previous owner\n"); throw movekeys_conflict(); } @@ -147,7 +145,6 @@ ACTOR static Future checkMoveKeysLock(Transaction* tr, return Void(); } else { TEST(true); // checkMoveKeysLock: Conflict with new owner - printf("MK: conflict %s with new owner %s\n", currentOwner.toString().c_str(), lock.myOwner.toString().c_str()); throw movekeys_conflict(); } } @@ -330,12 +327,6 @@ ACTOR static Future startMoveKeys(Database occ, state Future warningLogger = logWarningAfter("StartMoveKeysTooLong", 600, servers); // state TraceInterval waitInterval(""); - // TODO REMOVE - printf("starting move keys for [%s, %s): to %s\n", - keys.begin.toString().c_str(), - keys.end.toString().c_str(), - servers[0].toString().c_str()); - wait(startMoveKeysLock->take(TaskPriority::DataDistributionLaunch)); state FlowLock::Releaser releaser(*startMoveKeysLock); @@ -395,10 +386,10 @@ ACTOR static Future startMoveKeys(Database occ, state KeyRange currentKeys = KeyRangeRef(begin, keys.end); state RangeResult old = wait(krmGetRanges(tr, - keyServersPrefix, - currentKeys, - SERVER_KNOBS->MOVE_KEYS_KRM_LIMIT, - SERVER_KNOBS->MOVE_KEYS_KRM_LIMIT_BYTES)); + keyServersPrefix, + currentKeys, + SERVER_KNOBS->MOVE_KEYS_KRM_LIMIT, + SERVER_KNOBS->MOVE_KEYS_KRM_LIMIT_BYTES)); // Determine the last processed key (which will be the beginning for the next iteration) state Key endKey = old.end()[-1].key; @@ -531,37 +522,15 @@ ACTOR Future waitForShardReady(StorageServerInterface server, KeyRange keys, Version minVersion, GetShardStateRequest::waitMode mode) { - // TODO REMOVE - printf("waiting for shard [%s, %s) in state %d from %sss %s @ %lld\n", - keys.begin.toString().c_str(), - keys.end.toString().c_str(), - mode, - server.isTss ? 
"t" : "", - server.id().toString().c_str(), - minVersion); loop { try { GetShardStateReply rep = wait(server.getShardState.getReply(GetShardStateRequest(keys, mode), TaskPriority::MoveKeys)); if (rep.first >= minVersion) { - // TODO REMOVE - printf("shard [%s, %s) is in state %d from %sss %s @ %lld >= %lld\n", - keys.begin.toString().c_str(), - keys.end.toString().c_str(), - mode, - server.isTss ? "t" : "", - server.id().toString().c_str(), - rep.first, - minVersion); return Void(); } wait(delayJittered(SERVER_KNOBS->SHARD_READY_DELAY, TaskPriority::MoveKeys)); } catch (Error& e) { - printf("Waiting for shard from %sss %s getValue=%s got error! %d\n", - server.isTss ? "t" : "", - server.id().toString().c_str(), - server.getValue.getEndpoint().token.toString().c_str(), - e.code()); if (e.code() != error_code_timed_out) { if (e.code() != error_code_broken_promise) throw e; @@ -656,18 +625,16 @@ ACTOR static Future finishMoveKeys(Database occ, state Key endKey; state int retries = 0; state FlowLock::Releaser releaser; - state int waitForTSSCounter = - 2; // try waiting for tss for a 2 loops, give up if they're stuck to not affect the rest of the cluster // for killing tss if any get stuck during movekeys state KeyBackedMap tssMapDB = KeyBackedMap(tssMappingKeys.begin); state std::vector tssToKill; - state std::set tssToIgnore; + state std::unordered_set tssToIgnore; + // try waiting for tss for a 2 loops, give up if they're stuck to not affect the rest of the cluster + state int waitForTSSCounter = 2; ASSERT(!destinationTeam.empty()); - printf("finishing move keys for [%s, %s)\n", keys.begin.toString().c_str(), keys.end.toString().c_str()); - try { TraceEvent(SevDebug, interval.begin(), relocationIntervalId) .detail("KeyBegin", keys.begin) @@ -680,21 +647,17 @@ ACTOR static Future finishMoveKeys(Database occ, state Transaction tr(occ); - // TODO re-comment and change back - printf("finishMoveKeys( '%s'-'%s' )\n", begin.toString().c_str(), keys.end.toString().c_str()); 
+ // printf("finishMoveKeys( '%s'-'%s' )\n", begin.toString().c_str(), keys.end.toString().c_str()); loop { try { if (tssToKill.size()) { - // TODO could move this to helper method? - // TODO add trace event TEST(true); // killing TSS because they were unavailable for movekeys - printf("KILLING %d TSS BECAUSE THEY TIMED OUT IN MOVEKEYS\n", tssToKill.size()); - // kill tss BEFORE committing main txn so that client requests don't make it to the tss when it + // Kill tss BEFORE committing main txn so that client requests don't make it to the tss when it // has a different shard set than its pair use a different RYW transaction since i'm too lazy - // (and don't want to add bugs) by changing whole method to RYW. also using a different + // (and don't want to add bugs) by changing whole method to RYW. Also, using a different // transaction makes it commit earlier which we may need to guarantee causality of tss getting - // removed before client sends a request to this key range on the new ss + // removed before client sends a request to this key range on the new SS state Reference tssTr = makeReference(occ); loop { @@ -703,25 +666,22 @@ ACTOR static Future finishMoveKeys(Database occ, tssTr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); for (auto& tss : tssToKill) { // DO NOT remove server list key - that'll break a bunch of stuff. DD will - // eventually call removeStorageServer tssTr->clear(serverListKeyFor(tss.id())); + // eventually call removeStorageServer + tssTr->clear(serverTagKeyFor(tss.id())); - // tssTr->clear(serverTagHistoryRangeFor(tss.id())); - tssMapDB.erase(tssTr, tss.tssPairID); + tssMapDB.erase(tssTr, tss.tssPairID.get()); } tssTr->set(tssMappingChangeKey, deterministicRandom()->randomUniqueID().toString()); wait(tssTr->commit()); for (auto& tss : tssToKill) { - // TODO ADD trace event (sev30?) 
- printf("Successfully removed TSS %s in finishMoveKeys\n", - tss.id().toString().c_str()); + TraceEvent(SevWarnAlways, "TSS_KillMoveKeys").detail("TSSID", tss.id().toString()); tssToIgnore.insert(tss.id()); } tssToKill.clear(); break; } catch (Error& e) { - printf("MoveKeys TSS Removal Transaction got error %d\n", e.code()); wait(tssTr->onError(e)); } } @@ -950,9 +910,6 @@ ACTOR static Future finishMoveKeys(Database occ, for (auto& f : tssReady) { if (!f.isReady() || f.isError()) { anyTssNotDone = true; - printf("MK: [%s - %s) waiting on tss!\n", - begin.toString().c_str(), - keys.end.toString().c_str()); waitForTSSCounter--; break; } @@ -961,9 +918,6 @@ ACTOR static Future finishMoveKeys(Database occ, if (anyTssNotDone && waitForTSSCounter == 0) { for (int i = 0; i < tssReady.size(); i++) { if (!tssReady[i].isReady() || tssReady[i].isError()) { - // TODO trace event!! - printf("TSS NOT DONE %s with move keys, killing!!\n", - tssReadyInterfs[i].id().toString().c_str()); tssToKill.push_back(tssReadyInterfs[i]); } } @@ -981,22 +935,21 @@ ACTOR static Future finishMoveKeys(Database occ, for (int s = 0; s < tssReady.size(); s++) tssCount += tssReady[s].isReady() && !tssReady[s].isError(); - // TODO re-comment - if (tssReady.size()) { - printf(" fMK: [%s - %s) moved data to %d/%d servers and %d/%d tss\n", - begin.toString().c_str(), - keys.end.toString().c_str(), - count, - serverReady.size(), - tssCount, - tssReady.size()); + /*if (tssReady.size()) { + printf(" fMK: [%s - %s) moved data to %d/%d servers and %d/%d tss\n", + begin.toString().c_str(), + keys.end.toString().c_str(), + count, + serverReady.size(), + tssCount, + tssReady.size()); } else { - printf(" fMK: [%s - %s) moved data to %d/%d servers\n", - begin.toString().c_str(), - keys.end.toString().c_str(), - count, - serverReady.size()); - } + printf(" fMK: [%s - %s) moved data to %d/%d servers\n", + begin.toString().c_str(), + keys.end.toString().c_str(), + count, + serverReady.size()); + }*/ 
TraceEvent(SevDebug, waitInterval.end(), relocationIntervalId).detail("ReadyServers", count); if (count == dest.size()) { @@ -1026,7 +979,6 @@ ACTOR static Future finishMoveKeys(Database occ, } tr.reset(); } catch (Error& error) { - printf(" fMK: error %d\n", error.code()); if (error.code() == error_code_actor_cancelled) throw; state Error err = error; @@ -1059,13 +1011,11 @@ ACTOR Future> addStorageServer(Database cx, StorageServe state KeyBackedMap tssMapDB = KeyBackedMap(tssMappingKeys.begin); state int maxSkipTags = 1; - printf("%sSS %s adding itself\n", server.isTss ? "T" : "", server.id().toString().c_str()); loop { try { - // TODO should also set priority system immediate? also why is this needed? tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - // TODO don't fetch tag localities, all tags, and history tags if tss. Just fetch pair's tag + // FIXME: don't fetch tag localities, all tags, and history tags if tss. Just fetch pair's tag state Future fTagLocalities = tr->getRange(tagLocalityListKeys, CLIENT_KNOBS->TOO_MANY); state Future> fv = tr->get(serverListKeyFor(server.id())); @@ -1108,9 +1058,6 @@ ACTOR Future> addStorageServer(Database cx, StorageServe if (fExclProc.get().present() || fExclIP.get().present() || fFailProc.get().present() || fFailIP.get().present() || fExclProc2.get().present() || fExclIP2.get().present() || fFailProc2.get().present() || fFailIP2.get().present()) { - printf("%sSS %s failing to recruit because of exclusion\n", - server.isTss ? 
"T" : "", - server.id().toString().c_str()); throw recruitment_failed(); } @@ -1118,11 +1065,11 @@ ACTOR Future> addStorageServer(Database cx, StorageServe ASSERT(false); state Tag tag; - if (server.isTss) { + if (server.isTss()) { bool foundTag = false; for (auto& it : fTags.get()) { UID key = decodeServerTagKey(it.key); - if (key == server.tssPairID) { + if (key == server.tssPairID.get()) { tag = decodeServerTagValue(it.value); foundTag = true; break; @@ -1131,19 +1078,13 @@ ACTOR Future> addStorageServer(Database cx, StorageServe if (!foundTag) { throw recruitment_failed(); } - // ASSERT(foundTag); // TSS's pair was removed before TSS could register. Should never happen, since the - // SS shouldn't be tracked by DD until this completes. - printf("TSS %s found tag %s for pair %s\n", - server.id().toString().c_str(), - tag.toString().c_str(), - server.tssPairID.toString().c_str()); - tssMapDB.set(tr, server.tssPairID, server.id()); + + tssMapDB.set(tr, server.tssPairID.get(), server.id()); tr->set(tssMappingChangeKey, deterministicRandom()->randomUniqueID().toString()); } else { int8_t maxTagLocality = 0; state int8_t locality = -1; - // TODO i think tss can ignore this part? 
for (auto& kv : fTagLocalities.get()) { int8_t loc = decodeTagLocalityListValue(kv.value); if (decodeTagLocalityListKey(kv.key) == server.locality.dcId()) { @@ -1197,20 +1138,19 @@ ACTOR Future> addStorageServer(Database cx, StorageServe KeyRange conflictRange = singleKeyRange(serverTagConflictKeyFor(tag)); tr->addReadConflictRange(conflictRange); tr->addWriteConflictRange(conflictRange); + + if (SERVER_KNOBS->TSS_HACK_IDENTITY_MAPPING) { + // THIS SHOULD NEVER BE ENABLED IN ANY NON-TESTING ENVIRONMENT + TraceEvent(SevError, "TSSIdentityMappingEnabled"); + tssMapDB.set(tr, server.id(), server.id()); + tr->set(tssMappingChangeKey, deterministicRandom()->randomUniqueID().toString()); + } } tr->set(serverListKeyFor(server.id()), serverListValue(server)); wait(tr->commit()); - printf("%sSS %s successfully added itself @ %lld\n", - server.isTss ? "T" : "", - server.id().toString().c_str(), - tr->getCommittedVersion()); return std::make_pair(tr->getCommittedVersion(), tag); } catch (Error& e) { - printf("%sSS %s got error adding itself: %d!!\n", - server.isTss ? 
"T" : "", - server.id().toString().c_str(), - e.code()); if (e.code() == error_code_commit_unknown_result) throw recruitment_failed(); // There is a remote possibility that we successfully added ourselves and // then someone removed us, so we have to fail @@ -1252,8 +1192,6 @@ ACTOR Future removeStorageServer(Database cx, state bool retry = false; state int noCanRemoveCount = 0; - printf("Removing storage server %s\n", serverID.toString().c_str()); - loop { try { tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); @@ -1323,18 +1261,19 @@ ACTOR Future removeStorageServer(Database cx, } tr->clear(serverListKeyFor(serverID)); - tr->clear(serverTagKeyFor(serverID)); // the tss uses this to communicate shutdown but it never has a + tr->clear(serverTagKeyFor(serverID)); // A tss uses this to communicate shutdown but it never has a // server tag key set in the first place tr->clear(serverTagHistoryRangeFor(serverID)); - // TODO a small optimization would be to only erase and trigger tss mapping if this is a tss or an ss - // with a tss pair, instead of always - if (tssPairID.present()) { - tssMapDB.erase(tr, tssPairID.get()); - } else { + if (SERVER_KNOBS->TSS_HACK_IDENTITY_MAPPING) { + // THIS SHOULD NEVER BE ENABLED IN ANY NON-TESTING ENVIRONMENT + TraceEvent(SevError, "TSSIdentityMappingEnabled"); tssMapDB.erase(tr, serverID); + tr->set(tssMappingChangeKey, deterministicRandom()->randomUniqueID().toString()); + } else if (tssPairID.present()) { + tssMapDB.erase(tr, tssPairID.get()); + tr->set(tssMappingChangeKey, deterministicRandom()->randomUniqueID().toString()); } - tr->set(tssMappingChangeKey, deterministicRandom()->randomUniqueID().toString()); retry = true; wait(tr->commit()); @@ -1354,7 +1293,6 @@ ACTOR Future removeKeysFromFailedServer(Database cx, MoveKeysLock lock, const DDEnabledState* ddEnabledState) { state Key begin = allKeys.begin; - printf("Removing keys from failed server %s\n", serverID.toString().c_str()); // Multi-transactional removal 
in case of large number of shards, concern in violating 5s transaction limit while (begin < allKeys.end) { state Transaction tr(cx); @@ -1456,8 +1394,6 @@ ACTOR Future moveKeys(Database cx, if (!dataMovementComplete.isSet()) dataMovementComplete.send(Void()); - printf("move keys done for [%s, %s)\n", keys.begin.toString().c_str(), keys.end.toString().c_str()); - return Void(); } @@ -1486,6 +1422,15 @@ void seedShardServers(Arena& arena, CommitTransactionRef& tr, vectorTSS_HACK_IDENTITY_MAPPING) { + // THIS SHOULD NEVER BE ENABLED IN ANY NON-TESTING ENVIRONMENT + TraceEvent(SevError, "TSSIdentityMappingEnabled"); + // hack key-backed map here since we can't really change CommitTransactionRef to a RYW transaction + Key uidRef = Codec::pack(s.id()).pack(); + tr.set(arena, uidRef.withPrefix(tssMappingKeys.begin), uidRef); + // tssMapDB.set(tr, server.id(), server.id()); + tr.set(arena, tssMappingChangeKey, deterministicRandom()->randomUniqueID().toString()); + } } std::vector serverTags; diff --git a/fdbserver/MutationTracking.cpp b/fdbserver/MutationTracking.cpp index b0e7215fb8..16a17a0f10 100644 --- a/fdbserver/MutationTracking.cpp +++ b/fdbserver/MutationTracking.cpp @@ -30,9 +30,6 @@ // Track up to 2 keys in simulation via enabling MUTATION_TRACKING_ENABLED and setting the keys here. 
StringRef debugKey = LiteralStringRef(""); StringRef debugKey2 = LiteralStringRef("\xff\xff\xff\xff"); -// StringRef debugKey = LiteralStringRef("\x00\x00\x02\xff\x00\x00\x04\xc1\x00\x00\x00\x01\x00\x00\x00\x02"); // missing -// from ss StringRef debugKey2 = LiteralStringRef("\x00\x00\x02\xff\x00\x00\x01\x89\x00\x00\x00\x04\x00\x00\x00\x02"); -// // missing from tss TraceEvent debugMutationEnabled(const char* context, Version version, MutationRef const& mutation) { if ((mutation.type == mutation.ClearRange || mutation.type == mutation.DebugKeyRange) && diff --git a/fdbserver/QuietDatabase.actor.cpp b/fdbserver/QuietDatabase.actor.cpp index 40f731aed6..223393bcdd 100644 --- a/fdbserver/QuietDatabase.actor.cpp +++ b/fdbserver/QuietDatabase.actor.cpp @@ -294,11 +294,6 @@ ACTOR Future getMaxStorageServerQueueSize(Database cx, Reference servers = wait(serversFuture); state std::vector workers = wait(workersFuture); - /*printf("Found %d storage servers:\n", servers.size()); - for (auto& it : servers) { - printf(" %s\n", it.id().toString().c_str()); - }*/ - std::map workersMap; for (auto worker : workers) { workersMap[worker.interf.address()] = worker.interf; @@ -328,7 +323,6 @@ ACTOR Future getMaxStorageServerQueueSize(Database cx, Reference trackEachStorageServer( when(state std::pair> change = waitNext(serverChanges)) { wait(delay(0)); // prevent storageServerTracker from getting cancelled while on the call stack if (change.second.present()) { - if (!change.second.get().isTss) { // TODO is this all we need to do to get ratekeeper to ignore tss? 
+ if (!change.second.get().isTss()) { auto& a = actors[change.first]; a = Future(); a = splitError(trackStorageServerQueueInfo(self, change.second.get()), err); diff --git a/fdbserver/SimulatedCluster.actor.cpp b/fdbserver/SimulatedCluster.actor.cpp index 24d7dfb01d..70fed849d9 100644 --- a/fdbserver/SimulatedCluster.actor.cpp +++ b/fdbserver/SimulatedCluster.actor.cpp @@ -170,6 +170,9 @@ class TestConfig { if (attrib == "maxTLogVersion") { sscanf(value.c_str(), "%d", &maxTLogVersion); } + if (attrib == "restartInfoLocation") { + isFirstTestInRestart = true; + } } ifs.close(); @@ -183,6 +186,7 @@ public: bool configureLocked = false; bool startIncompatibleProcess = false; int logAntiQuorum = -1; + bool firstTestInRestart = false; // Storage Engine Types: Verify match with SimulationConfig::generateNormalConfig // 0 = "ssd" // 1 = "memory" @@ -235,6 +239,8 @@ public: for (const auto& [key, value] : conf) { if (key == "ClientInfoLogging") { setNetworkOption(FDBNetworkOptions::DISABLE_CLIENT_STATISTICS_LOGGING); + } else if (key == "restartInfoLocation") { + isFirstTestInRestart = true; } else { builder.set(key, value); } @@ -1165,13 +1171,9 @@ void SimulationConfig::generateNormalConfig(const TestConfig& testConfig) { } int tssCount = 0; - // if (!testConfig.simpleConfig && deterministicRandom()->random01() < 0.25) { - if (true) { - // if (false) { - // tss + if (!testconfig.simpleConfig && deterministicRandom()->random01() < 0.25) { // 1 or 2 tss tssCount = deterministicRandom()->randomInt(1, 3); - printf("Initial tss count to %d\n", tssCount); } // if (deterministicRandom()->random01() < 0.5) { @@ -1510,7 +1512,6 @@ void SimulationConfig::generateNormalConfig(const TestConfig& testConfig) { // reduce tss to half of extra non-seed servers that can be recruited in usable regions. 
tssCount = std::max(0, std::min(tssCount, (db.usableRegions * (machine_count / datacenters) - replication_type) / 2)); - printf("Adjusted tss count to %d\n", tssCount); if (tssCount > 0) { std::string confStr = format("tss_count:=%d tss_storage_engine:=%d", tssCount, db.storageServerStoreType); @@ -1519,13 +1520,13 @@ void SimulationConfig::generateNormalConfig(const TestConfig& testConfig) { if (tssRandom > 0.5) { // normal tss mode g_simulator.tssMode = ISimulator::TSSMode::EnabledNormal; - printf("normal tss mode\n"); - } else if (tssRandom < 0.25) { + } else if (tssRandom < 0.25 && !testConfig.isFirstTestInRestart) { + // fault injection - don't enable in first test in restart because second test won't know it intentionally + // lost data + g_simulator.tssMode = ISimulator::TSSMode::EnabledDropMutations; + } else { // delay injection g_simulator.tssMode = ISimulator::TSSMode::EnabledAddDelay; - } else { - // fault injection - g_simulator.tssMode = ISimulator::TSSMode::EnabledDropMutations; } printf("enabling tss for simulation in mode %d: %s\n", g_simulator.tssMode, confStr.c_str()); } diff --git a/fdbserver/Status.actor.cpp b/fdbserver/Status.actor.cpp index 723c6c6111..5305abacbc 100644 --- a/fdbserver/Status.actor.cpp +++ b/fdbserver/Status.actor.cpp @@ -630,7 +630,7 @@ struct RolesInfo { TraceEventFields const& commitLatencyBands = metrics.at("CommitLatencyBands"); if (commitLatencyBands.size()) { obj["commit_latency_bands"] = addLatencyBandInfo(commitLatencyBands); - } + } TraceEventFields const& commitBatchingWindowSize = metrics.at("CommitBatchingWindowSize"); if (commitBatchingWindowSize.size()) { @@ -1869,10 +1869,10 @@ ACTOR static Future>> getTLogsAndMetri ACTOR static Future>> getCommitProxiesAndMetrics( Reference> db, std::unordered_map address_workers) { - vector> results = - wait(getServerMetrics(db->get().client.commitProxies, - address_workers, - std::vector{ "CommitLatencyMetrics", "CommitLatencyBands", "CommitBatchingWindowSize"})); + vector> 
results = wait(getServerMetrics( + db->get().client.commitProxies, + address_workers, + std::vector{ "CommitLatencyMetrics", "CommitLatencyBands", "CommitBatchingWindowSize" })); return results; } @@ -3007,7 +3007,7 @@ ACTOR Future clusterGetStatus( int activeTSSCount = 0; for (auto& it : storageServers) { - if (it.first.isTss) { + if (it.first.isTss()) { activeTSSCount++; } } diff --git a/fdbserver/TLogServer.actor.cpp b/fdbserver/TLogServer.actor.cpp index f884a2e310..4ea9e83bee 100644 --- a/fdbserver/TLogServer.actor.cpp +++ b/fdbserver/TLogServer.actor.cpp @@ -1671,11 +1671,6 @@ ACTOR Future tLogPeekMessages(TLogData* self, TLogPeekRequest req, Referen Version poppedVer = poppedVersion(logData, req.tag); if (poppedVer > req.begin) { - printf("tag %s - %s tried to peek popped data!!: %lld > %lld\n", - req.tag.toString().c_str(), - peekId.toString().c_str(), - poppedVer, - req.begin); TLogPeekReply rep; rep.maxKnownVersion = logData->version.get(); rep.minKnownCommittedVersion = logData->minKnownCommittedVersion; diff --git a/fdbserver/WorkerInterface.actor.h b/fdbserver/WorkerInterface.actor.h index 48a4d9ce07..689bf0a68c 100644 --- a/fdbserver/WorkerInterface.actor.h +++ b/fdbserver/WorkerInterface.actor.h @@ -614,18 +614,13 @@ struct InitializeStorageRequest { UID reqId; UID interfaceId; KeyValueStoreType storeType; - bool isTss; - UID tssPairID; - Version tssPairVersion; + Optional> + tssPairIDAndVersion; // Only set if recruiting a tss. Will be the UID and Version of its SS pair. 
ReplyPromise reply; template void serialize(Ar& ar) { - if (ar.protocolVersion().hasTSS()) { - serializer(ar, seedTag, reqId, interfaceId, storeType, reply, isTss, tssPairID, tssPairVersion); - } else { - serializer(ar, seedTag, reqId, interfaceId, storeType, reply); - } + serializer(ar, seedTag, reqId, interfaceId, storeType, reply, tssPairIDAndVersion); } }; diff --git a/fdbserver/masterserver.actor.cpp b/fdbserver/masterserver.actor.cpp index c0dee60682..97953ce1a3 100644 --- a/fdbserver/masterserver.actor.cpp +++ b/fdbserver/masterserver.actor.cpp @@ -417,14 +417,11 @@ ACTOR Future newTLogServers(Reference self, ACTOR Future newSeedServers(Reference self, RecruitFromConfigurationReply recruits, vector* servers) { - printf("Seeding initial %d storage servers\n", recruits.storageServers.size()); // This is only necessary if the database is at version 0 servers->clear(); if (self->lastEpochEnd) return Void(); - // TODO might need to make this handle TSS recruitment (or make RecruitFromConfiguration handle it?) for simulation - state int idx = 0; state std::map, Tag> dcId_tags; state int8_t nextLocality = 0; @@ -437,7 +434,6 @@ ACTOR Future newSeedServers(Reference self, ? 
dcId_tags[recruits.storageServers[idx].locality.dcId()] : Tag(nextLocality, 0); isr.storeType = self->configuration.storageServerStoreType; - isr.isTss = false; isr.reqId = deterministicRandom()->randomUniqueID(); isr.interfaceId = deterministicRandom()->randomUniqueID(); @@ -473,8 +469,6 @@ ACTOR Future newSeedServers(Reference self, .detail("TargetCount", self->configuration.storageTeamSize) .detail("Servers", describe(*servers)); - printf("Seed servers sees %d desired tss\n", self->configuration.desiredTSSCount); - return Void(); } diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp index 7fe0b1c2a3..507de28f32 100644 --- a/fdbserver/storageserver.actor.cpp +++ b/fdbserver/storageserver.actor.cpp @@ -523,10 +523,6 @@ public: TraceEvent(SevWarnAlways, "TSSInjectFaultEnabled", thisServerID) .detail("Mode", g_simulator.tssMode) .detail("At", tssFaultInjectTime.get()); - printf("ENABLING FAULT INJECTION FOR TSS %s at time %.4f in mode %d\n", - thisServerID.toString().c_str(), - tssFaultInjectTime.get(), - g_simulator.tssMode); } } @@ -1077,24 +1073,12 @@ void updateProcessStats(StorageServer* self) { ACTOR Future waitForVersionActor(StorageServer* data, Version version, SpanID spanContext) { state Span span("SS.WaitForVersion"_loc, { spanContext }); - /*if (172218491 == version) { - printf("%sSS %s starting waitForVersionActor @ %lld\n", data->tssPairID.present() ? "T" : "", - data->thisServerID.toString().c_str(), version); - }*/ choose { when(wait(data->version.whenAtLeast(version))) { // FIXME: A bunch of these can block with or without the following delay 0. // wait( delay(0) ); // don't do a whole bunch of these at once - /*if (172218491 == version) { - printf("%sSS %s waitForVersionActor @ %lld - at least version\n", data->tssPairID.present() ? 
"T" : "", - data->thisServerID.toString().c_str(), version); - }*/ if (version < data->oldestVersion.get()) throw transaction_too_old(); // just in case - /*if (172218491 == version) { - printf("%sSS %s waitForVersionActor @ %lld - not too old\n", data->tssPairID.present() ? "T" : "", - data->thisServerID.toString().c_str(), version); - }*/ return version; } when(wait(delay(SERVER_KNOBS->FUTURE_VERSION_DELAY))) { @@ -1103,39 +1087,23 @@ ACTOR Future waitForVersionActor(StorageServer* data, Version version, .detail("Version", version) .detail("MyVersion", data->version.get()) .detail("ServerID", data->thisServerID); - /*if (172218491 == version) { - printf("%sSS %s waitForVersionActor @ %lld - future version\n", data->tssPairID.present() ? "T" : "", - data->thisServerID.toString().c_str(), version); - }*/ throw future_version(); } } } Future waitForVersion(StorageServer* data, Version version, SpanID spanContext) { - /*if (172218491 == version) { - printf("%sSS %s started waitForVersion @ %lld\n", data->tssPairID.present() ? "T" : "", - data->thisServerID.toString().c_str(), version); - }*/ if (version == latestVersion) { version = std::max(Version(1), data->version.get()); } if (version < data->oldestVersion.get() || version <= 0) { - /*if (172218491 == version) { - printf("%sSS %s waitForVersion @ %lld - transaction too old\n", data->tssPairID.present() ? "T" : "", - data->thisServerID.toString().c_str(), version); - }*/ return transaction_too_old(); } else if (version <= data->version.get()) { return version; } if ((data->behind || data->versionBehind) && version > data->version.get()) { - /*if (172218491 == version) { - printf("%sSS %s waitForVersion @ %lld - process_behind\n", data->tssPairID.present() ? 
"T" : "", - data->thisServerID.toString().c_str(), version); - }*/ return process_behind(); } @@ -1169,11 +1137,6 @@ ACTOR Future getValueQ(StorageServer* data, GetValueRequest req) { Span span("SS:getValue"_loc, { req.spanContext }); span.addTag("key"_sr, req.key); - /*if ("000002ff000004c10000000100000002" == req.key.toString() && 172218491 == req.version) { - printf("%sSS %s started getValueQ for %s @ %lld\n", data->tssPairID.present() ? "T" : "", - data->thisServerID.toString().c_str(), req.key.toString().c_str(), req.version); - }*/ - try { ++data->counters.getValueQueries; ++data->counters.allQueries; @@ -1185,11 +1148,6 @@ ACTOR Future getValueQ(StorageServer* data, GetValueRequest req) { // so we need to downgrade here wait(data->getQueryDelay()); - /*if ("000002ff000004c10000000100000002" == req.key.toString() && 172218491 == req.version) { - printf("%sSS %s getValueQ for %s @ %lld - got query delay\n", data->tssPairID.present() ? "T" : "", - data->thisServerID.toString().c_str(), req.key.toString().c_str(), req.version); - }*/ - if (req.debugID.present()) g_traceBatch.addEvent("GetValueDebug", req.debugID.get().first(), @@ -1204,17 +1162,8 @@ ACTOR Future getValueQ(StorageServer* data, GetValueRequest req) { state uint64_t changeCounter = data->shardChangeCounter; - /*if ("000002ff000004c10000000100000002" == req.key.toString() && 172218491 == req.version) { - printf("%sSS %s getValueQ for %s @ %lld - waited for version\n", data->tssPairID.present() ? "T" : "", - data->thisServerID.toString().c_str(), req.key.toString().c_str(), req.version); - }*/ - if (!data->shards[req.key]->isReadable()) { //TraceEvent("WrongShardServer", data->thisServerID).detail("Key", req.key).detail("Version", version).detail("In", "getValueQ"); - /*if ("000002ff000004c10000000100000002" == req.key.toString() && 172218491 == req.version) { - printf("%sSS %s started getValueQ for %s @ %lld got wrong shard server\n", data->tssPairID.present() ? 
- "T" : "", data->thisServerID.toString().c_str(), req.key.toString().c_str(), req.version); - }*/ throw wrong_shard_server(); } @@ -1223,10 +1172,6 @@ ACTOR Future getValueQ(StorageServer* data, GetValueRequest req) { if (i && i->isValue() && i.key() == req.key) { v = (Value)i->getValue(); path = 1; - /*if ("000002ff000004c10000000100000002" == req.key.toString() && 172218491 == req.version) { - printf("%sSS %s getValueQ for %s @ %lld - got from memory\n", data->tssPairID.present() ? "T" : "", - data->thisServerID.toString().c_str(), req.key.toString().c_str(), req.version); - }*/ } else if (!i || !i->isClearTo() || i->getEndKey() <= req.key) { path = 2; Optional vv = wait(data->storage.readValue(req.key, req.debugID)); @@ -1237,10 +1182,6 @@ ACTOR Future getValueQ(StorageServer* data, GetValueRequest req) { } data->checkChangeCounter(changeCounter, req.key); v = vv; - /*if ("000002ff000004c10000000100000002" == req.key.toString() && 172218491 == req.version) { - printf("%sSS %s getValueQ for %s @ %lld - got from storage\n", data->tssPairID.present() ? "T" : "", - data->thisServerID.toString().c_str(), req.key.toString().c_str(), req.version); - }*/ } DEBUG_MUTATION("ShardGetValue", @@ -1268,12 +1209,6 @@ ACTOR Future getValueQ(StorageServer* data, GetValueRequest req) { ++data->counters.emptyQueries; } - /*if ("000002ff000004c10000000100000002" == req.key.toString() && 172218491 == req.version) { - printf("%sSS %s getValueQ for %s @ %lld = %s\n", data->tssPairID.present() ? "T" : "", - data->thisServerID.toString().c_str(), req.key.toString().c_str(), req.version, v.present() ? - v.get().toString().c_str() : ""); - }*/ - if (SERVER_KNOBS->READ_SAMPLING_ENABLED) { // If the read yields no value, randomly sample the empty read. 
int64_t bytesReadPerKSecond = @@ -1296,16 +1231,8 @@ ACTOR Future getValueQ(StorageServer* data, GetValueRequest req) { reply.penalty = data->getPenalty(); req.reply.send(reply); } catch (Error& e) { - /*if ("000002ff000004c10000000100000002" == req.key.toString() && 172218491 == req.version) { - printf("%sSS %s getValueQ for %s @ %lld = ERROR: %d\n", data->tssPairID.present() ? "T" : "", - data->thisServerID.toString().c_str(), req.key.toString().c_str(), req.version, e.code()); - }*/ if (!canReplyWith(e)) throw; - /*if ("000002ff000004c10000000100000002" == req.key.toString() && 172218491 == req.version) { - printf("%sSS %s getValueQ for %s @ %lld = replying with error: %d\n", data->tssPairID.present() ? "T" : "", - data->thisServerID.toString().c_str(), req.key.toString().c_str(), req.version, e.code()); - }*/ data->sendErrorWithPenalty(req.reply, e, data->getPenalty()); } @@ -1816,12 +1743,6 @@ ACTOR Future findKey(StorageServer* data, state int distance = forward ? sel.offset : 1 - sel.offset; state Span span("SS.findKey"_loc, { parentSpan }); - /*if (version == 166817893 && sel.offset == 80) { - printf("%sSS %s FindKey request %s:<%s:%d @ %lld: with key range [%s - %s):\n", data->isTss() ? "t" : "", - data->thisServerID.toString().c_str(), sel.getKey().printable().c_str(), sel.orEqual ? "=" : "", sel.offset, - version, range.begin.toString().c_str(), range.end.toString().c_str()); - }*/ - // Don't limit the number of bytes if this is a trivial key selector (there will be at most two items returned from // the read range in this case) state int maxBytes; @@ -1841,13 +1762,6 @@ ACTOR Future findKey(StorageServer* data, span.context)); state bool more = rep.more && rep.data.size() != distance + skipEqualKey; - /*if (version == 166817893 && sel.offset == 80) { - printf("%sSS %s FindKey request %s:<%s:%d @ %lld: readRange with limBytes=%d got %d:\n", data->isTss() ? "t" : - "", data->thisServerID.toString().c_str(), sel.getKey().printable().c_str(), sel.orEqual ? 
"=" : "", sel.offset, - version, maxBytes, rep.data.size()); for (auto& it : rep.data) { printf(" %s\n", it.key.toString().c_str()); - } - }*/ - // If we get only one result in the reverse direction as a result of the data being too large, we could get stuck in // a loop if (more && !forward && rep.data.size() == 1) { @@ -1894,19 +1808,8 @@ ACTOR Future findKey(StorageServer* data, // This is possible if key/value pairs are very large and only one result is returned on a last less than // query SOMEDAY: graceful handling of exceptionally sized values ASSERT(returnKey != sel.getKey()); - - /*if (version == 166817893 && sel.offset == 80) { - printf("%sSS %s FindKey request %s:<%s:%d @ %lld: moving same shard\n", data->isTss() ? "t" : "", - data->thisServerID.toString().c_str(), sel.getKey().printable().c_str(), sel.orEqual ? "=" : "", sel.offset, - version); - }*/ return returnKey; } else { - /*if (version == 166817893 && sel.offset == 80) { - printf("%sSS %s FindKey request %s:<%s:%d @ %lld: moving shard boundary\n", data->isTss() ? "t" : "", - data->thisServerID.toString().c_str(), sel.getKey().printable().c_str(), sel.orEqual ? "=" : "", sel.offset, - version); - }*/ return forward ? range.end : range.begin; } } @@ -1931,15 +1834,6 @@ ACTOR Future getKeyValuesQ(StorageServer* data, GetKeyValuesRequest req) state Span span("SS:getKeyValues"_loc, { req.spanContext }); state int64_t resultSize = 0; - if (req.begin.getKey().toString() == "B" && req.end.getKey().toString() == "v" && req.version == 107157353) { - printf("%sSS %s starting query [%s - %s) @ %lld\n", - data->isTss() ? 
"T" : "", - data->thisServerID.toString().c_str(), - req.begin.getKey().printable().c_str(), - req.end.getKey().printable().c_str(), - req.version); - } - ++data->counters.getRangeQueries; ++data->counters.allQueries; ++data->readQueueSizeMetric; @@ -1954,15 +1848,6 @@ ACTOR Future getKeyValuesQ(StorageServer* data, GetKeyValuesRequest req) wait(data->getQueryDelay()); } - if (req.begin.getKey().toString() == "B" && req.end.getKey().toString() == "v" && req.version == 107157353) { - printf("%sSS %s downgraded [%s - %s) @ %lld\n", - data->isTss() ? "T" : "", - data->thisServerID.toString().c_str(), - req.begin.getKey().printable().c_str(), - req.end.getKey().printable().c_str(), - req.version); - } - try { if (req.debugID.present()) g_traceBatch.addEvent("TransactionDebug", req.debugID.get().first(), "storageserver.getKeyValues.Before"); @@ -1987,15 +1872,6 @@ ACTOR Future getKeyValuesQ(StorageServer* data, GetKeyValuesRequest req) throw wrong_shard_server(); } - if (req.begin.getKey().toString() == "B" && req.end.getKey().toString() == "v" && req.version == 107157353) { - printf("%sSS %s validated shard [%s - %s) @ %lld\n", - data->isTss() ? "T" : "", - data->thisServerID.toString().c_str(), - req.begin.getKey().printable().c_str(), - req.end.getKey().printable().c_str(), - req.version); - } - state int offset1; state int offset2; state Future fBegin = req.begin.isFirstGreaterOrEqual() @@ -2026,25 +1902,6 @@ ACTOR Future getKeyValuesQ(StorageServer* data, GetKeyValuesRequest req) throw wrong_shard_server(); } - if (req.begin.getKey().toString() == "B" && req.end.getKey().toString() == "v" && req.version == 107157353) { - printf("%sSS %s resolved begin and end [%s - %s) @ %lld\n", - data->isTss() ? "T" : "", - data->thisServerID.toString().c_str(), - req.begin.getKey().printable().c_str(), - req.end.getKey().printable().c_str(), - req.version); - printf(" %s:<%s:%d @ -> %s\n", - req.begin.getKey().printable().c_str(), - req.begin.orEqual ? 
"=" : "", - req.begin.offset, - req.begin.getKey().printable().c_str()); - printf(" %s:<%s:%d @ -> %s\n", - req.end.getKey().printable().c_str(), - req.end.orEqual ? "=" : "", - req.end.offset, - req.end.getKey().printable().c_str()); - } - if (begin >= end) { if (req.debugID.present()) g_traceBatch.addEvent("TransactionDebug", req.debugID.get().first(), "storageserver.getKeyValues.Send"); @@ -2062,28 +1919,10 @@ ACTOR Future getKeyValuesQ(StorageServer* data, GetKeyValuesRequest req) } else { state int remainingLimitBytes = req.limitBytes; - /*if (req.begin.getKey().toString() == "m3fc7" && req.end.getKey().toString() == "s" && req.version == - 133421369) { printf("%sSS %s beginning readRange [%s - %s) @ %lld\n", data->isTss() ? "T" : "", - data->thisServerID.toString().c_str(), req.begin.getKey().printable().c_str(), - req.end.getKey().printable().c_str(), req.version); - }*/ - GetKeyValuesReply _r = wait(readRange(data, version, KeyRangeRef(begin, end), req.limit, &remainingLimitBytes, span.context)); GetKeyValuesReply r = _r; - if (req.begin.getKey().toString() == "B" && req.end.getKey().toString() == "v" && - req.version == 107157353) { - printf("%sSS %s completed readRange (%d)%s: \n", - data->isTss() ? "T" : "", - data->thisServerID.toString().c_str(), - r.data.size(), - r.more ? "+" : ""); - /*for (auto& it : r.data) { - printf(" %s=%s\n", it.key.printable().c_str(), it.value.printable().c_str()); - }*/ - } - if (req.debugID.present()) g_traceBatch.addEvent( "TransactionDebug", req.debugID.get().first(), "storageserver.getKeyValues.AfterReadRange"); @@ -2116,14 +1955,6 @@ ACTOR Future getKeyValuesQ(StorageServer* data, GetKeyValuesRequest req) data->metrics.notifyBytesReadPerKSecond(r.data[r.data.size() - 1].key, bytesReadPerKSecond); } - if (req.begin.getKey().toString() == "B" && req.end.getKey().toString() == "v" && - req.version == 107157353) { - printf("%sSS %s replying to %s\n", - data->isTss() ? 
"T" : "", - data->thisServerID.toString().c_str(), - req.reply.getEndpoint().token.toString().c_str()); - } - r.penalty = data->getPenalty(); req.reply.send(r); @@ -2174,33 +2005,15 @@ ACTOR Future getKeyQ(StorageServer* data, GetKeyRequest req) { // so we need to downgrade here wait(data->getQueryDelay()); - /*if (req.version == 166817893 && req.sel.offset == 80) { - printf("%sSS %s GetKey request %s:<%s:%d @ %lld\n", data->isTss() ? "t" : "", - data->thisServerID.toString().c_str(), req.sel.getKey().printable().c_str(), req.sel.orEqual ? "=" : "", - req.sel.offset, req.version); - }*/ - try { state Version version = wait(waitForVersion(data, req.version, req.spanContext)); - /*if (req.version == 166817893 && req.sel.offset == 80) { - printf("%sSS %s GetKey request %s:<%s:%d @ %lld: waited for version\n", data->isTss() ? "t" : "", - data->thisServerID.toString().c_str(), req.sel.getKey().printable().c_str(), req.sel.orEqual ? "=" : "", - req.sel.offset, req.version); - }*/ - state uint64_t changeCounter = data->shardChangeCounter; state KeyRange shard = getShardKeyRange(data, req.sel); state int offset; Key k = wait(findKey(data, req.sel, version, shard, &offset, req.spanContext)); - /*if (req.version == 166817893 && req.sel.offset == 80) { - printf("%sSS %s GetKey request %s:<%s:%d @ %lld: found key: %s\n", data->isTss() ? "t" : "", - data->thisServerID.toString().c_str(), req.sel.getKey().printable().c_str(), req.sel.orEqual ? "=" : "", - req.sel.offset, req.version, k.toString().c_str()); - }*/ - data->checkChangeCounter( changeCounter, KeyRangeRef(std::min(req.sel.getKey(), k), std::max(req.sel.getKey(), k))); @@ -2215,12 +2028,6 @@ ACTOR Future getKeyQ(StorageServer* data, GetKeyRequest req) { else updated = KeySelectorRef(k, true, 0); // found - /*if (req.version == 166817893 && req.sel.offset == 80) { - printf("%sSS %s GetKey request %s:<%s:%d @ %lld: updated: %s:<%s:%d\n", data->isTss() ? 
"t" : "", - data->thisServerID.toString().c_str(), req.sel.getKey().printable().c_str(), req.sel.orEqual ? "=" : "", - req.sel.offset, req.version, updated.getKey().printable().c_str(), updated.orEqual ? "=" : "", updated.offset); - }*/ - resultSize = k.size(); data->counters.bytesQueried += resultSize; ++data->counters.rowsQueried; @@ -2545,14 +2352,6 @@ void removeDataRange(StorageServer* ss, // disk when this latest version becomes durable mLV is also modified if necessary to ensure that split clears can // be forgotten - // TODO REMOVE print - printf("%sss %s removing data range [%s - %s) @ %lld\n", - ss->isTss() ? "t" : "", - ss->thisServerID.toString().c_str(), - range.begin.toString().c_str(), - range.end.toString().c_str(), - mLV.version); - MutationRef clearRange(MutationRef::ClearRange, range.begin, range.end); clearRange = ss->addMutationToMutationLog(mLV, clearRange); @@ -2583,13 +2382,6 @@ void removeDataRange(StorageServer* ss, } data.erase(range.begin, range.end); - - printf("%sss %s removed data range [%s - %s) @ %lld\n", - ss->isTss() ? 
"t" : "", - ss->thisServerID.toString().c_str(), - range.begin.toString().c_str(), - range.end.toString().c_str(), - mLV.version); } void setAvailableStatus(StorageServer* self, KeyRangeRef keys, bool available); @@ -3170,12 +2962,12 @@ void changeServerKeys(StorageServer* data, ChangeServerKeysContext context) { ASSERT(!keys.empty()); - TraceEvent("ChangeServerKeys", data->thisServerID) - .detail("KeyBegin", keys.begin) - .detail("KeyEnd", keys.end) - .detail("NowAssigned", nowAssigned) - .detail("Version", version) - .detail("Context", changeServerKeysContextName[(int)context]); + // TraceEvent("ChangeServerKeys", data->thisServerID) + // .detail("KeyBegin", keys.begin) + // .detail("KeyEnd", keys.end) + // .detail("NowAssigned", nowAssigned) + // .detail("Version", version) + // .detail("Context", changeServerKeysContextName[(int)context]); validate(data); // TODO(alexmiller): Figure out how to selectively enable spammy data distribution events. @@ -3193,7 +2985,7 @@ void changeServerKeys(StorageServer* data, } } if (!isDifferent) { - TraceEvent("CSKShortCircuit", data->thisServerID).detail("KeyBegin", keys.begin).detail("KeyEnd", keys.end); + // TraceEvent("CSKShortCircuit", data->thisServerID).detail("KeyBegin", keys.begin).detail("KeyEnd", keys.end); return; } @@ -3231,13 +3023,13 @@ void changeServerKeys(StorageServer* data, for (auto r = vr.begin(); r != vr.end(); ++r) { KeyRangeRef range = keys & r->range(); bool dataAvailable = r->value() == latestVersion || r->value() >= version; - TraceEvent("CSKRange", data->thisServerID) - .detail("KeyBegin", range.begin) - .detail("KeyEnd", range.end) - .detail("Available", dataAvailable) - .detail("NowAssigned", nowAssigned) - .detail("NewestAvailable", r->value()) - .detail("ShardState0", data->shards[range.begin]->debugDescribeState()); + // TraceEvent("CSKRange", data->thisServerID) + // .detail("KeyBegin", range.begin) + // .detail("KeyEnd", range.end) + // .detail("Available", dataAvailable) + // 
.detail("NowAssigned", nowAssigned) + // .detail("NewestAvailable", r->value()) + // .detail("ShardState0", data->shards[range.begin]->debugDescribeState()); if (!nowAssigned) { if (dataAvailable) { ASSERT(r->value() == @@ -3279,14 +3071,8 @@ void changeServerKeys(StorageServer* data, oldShards.clear(); ranges.clear(); for (auto r = removeRanges.begin(); r != removeRanges.end(); ++r) { - // TODO should we do this at the passed in version? (or the passed in version + 1?) removeDataRange(data, data->addVersionToMutationLog(data->data().getLatestVersion()), data->shards, *r); setAvailableStatus(data, *r, false); - printf("%sss %s set data range unavailable [%s - %s)\n", - data->isTss() ? "t" : "", - data->thisServerID.toString().c_str(), - keys.begin.toString().c_str(), - keys.end.toString().c_str()); } validate(data); } @@ -3458,26 +3244,18 @@ private: data->recoveryVersionSkips.emplace_back(rollbackVersion, currentVersion - rollbackVersion); } else if (m.type == MutationRef::SetValue && m.param1 == killStoragePrivateKey) { - printf("worked removed kill storage: %s\n", data->thisServerID.toString().c_str()); throw worker_removed(); } else if ((m.type == MutationRef::SetValue || m.type == MutationRef::ClearRange) && m.param1.substr(1).startsWith(serverTagPrefix)) { UID serverTagKey = decodeServerTagKey(m.param1.substr(1)); - // bool matchesThisServer = (!data->isTss() && serverTagKey == data->thisServerID) || (data->isTss() && - // serverTagKey == data->tssPairID.get()); bool matchesThisServer = serverTagKey == data->thisServerID; bool matchesTssPair = data->isTss() ? serverTagKey == data->tssPairID.get() : false; if ((m.type == MutationRef::SetValue && !data->isTss() && !matchesThisServer) || (m.type == MutationRef::ClearRange && (matchesThisServer || (data->isTss() && matchesTssPair)))) { - printf("%sSS %s removed b/c tag mutation: %s\n", - data->isTss() ? 
"T" : "", - data->thisServerID.toString().c_str(), - m.toString().c_str()); throw worker_removed(); } } else if (m.type == MutationRef::SetValue && m.param1 == rebootWhenDurablePrivateKey) { data->rebootAfterDurableVersion = currentVersion; - printf("%s got reboot after durable @ %lld\n", data->thisServerID.toString().c_str(), currentVersion); TraceEvent("RebootWhenDurableSet", data->thisServerID) .detail("DurableVersion", data->durableVersion.get()) .detail("RebootAfterDurableVersion", data->rebootAfterDurableVersion); @@ -3542,12 +3320,10 @@ ACTOR Future update(StorageServer* data, bool* pReceivedUpdate) { wait(delayJittered(.005, TaskPriority::TLogPeekReply)); } - // TODO REMOVE!! just for testing what happens when TSS gets behind if (g_network->isSimulated() && data->isTss() && g_simulator.tssMode == ISimulator::TSSMode::EnabledAddDelay && data->tssFaultInjectTime.present() && data->tssFaultInjectTime.get() < now()) { if (deterministicRandom()->random01() < 0.01) { TraceEvent(SevWarnAlways, "TSSInjectDelayForever", data->thisServerID); - printf("TSS %s INJECTING DELAY FOREVER!!\n", data->thisServerID.toString().c_str()); // small random chance to just completely get stuck here, each tss should eventually hit this in this // mode wait(Never()); @@ -3555,7 +3331,6 @@ ACTOR Future update(StorageServer* data, bool* pReceivedUpdate) { // otherwise pause for part of a second double delayTime = deterministicRandom()->random01(); TraceEvent(SevWarnAlways, "TSSInjectDelay", data->thisServerID).detail("Delay", delayTime); - printf("TSS %s INJECTING DELAY for %.4f!!\n", data->thisServerID.toString().c_str(), delayTime); wait(delay(delayTime)); } } @@ -3573,8 +3348,6 @@ ACTOR Future update(StorageServer* data, bool* pReceivedUpdate) { } } if (cursor->popped() > 0) { - printf( - "Worker removed because of popped=%d: %s\n", cursor->popped(), data->thisServerID.toString().c_str()); throw worker_removed(); } @@ -3982,9 +3755,6 @@ ACTOR Future updateStorage(StorageServer* data) 
{ #endif void StorageServerDisk::makeNewStorageServerDurable() { - // TODO REMOVE print - printf( - "%sSS %s saving durable state\n", data->tssPairID.present() ? "T" : "", data->thisServerID.toString().c_str()); storage->set(persistFormat); storage->set(KeyValueRef(persistID, BinaryWriter::toValue(data->thisServerID, Unversioned()))); if (data->tssPairID.present()) { @@ -4268,17 +4038,6 @@ ACTOR Future restoreDurableState(StorageServer* data, IKeyValueStore* stor debug_checkRestoredVersion(data->thisServerID, version, "StorageServer"); data->setInitialVersion(version); - // TODO REMOVE print - printf("%sSS %s restored durable state @ %lld\n", - data->tssPairID.present() ? "T" : "", - data->thisServerID.toString().c_str(), - version); - if (data->tssPairID.present()) { - printf("TSS %s recovered pairing to SS %s\n", - data->thisServerID.toString().c_str(), - data->tssPairID.get().toString().c_str()); - } - state RangeResult available = fShardAvailable.get(); state int availableLoc; for (availableLoc = 0; availableLoc < available.size(); availableLoc++) { @@ -4565,9 +4324,9 @@ ACTOR Future metricsCore(StorageServer* self, StorageServerInterface ssi) SERVER_KNOBS->STORAGE_LOGGING_DELAY, &self->counters.cc, self->thisServerID.toString() + "/StorageMetrics", - [self=self](TraceEvent& te) { + [self = self](TraceEvent& te) { te.detail("Tag", self->tag.toString()); - StorageBytes sb = self->storage.getStorageBytes(); + StorageBytes sb = self->storage.getStorageBytes(); te.detail("KvstoreBytesUsed", sb.used); te.detail("KvstoreBytesFree", sb.free); te.detail("KvstoreBytesAvailable", sb.available); @@ -4688,19 +4447,6 @@ ACTOR Future serveGetKeyValuesRequests(StorageServer* self, FutureStreamisTss() ? 
"T" : "", - self->thisServerID.toString().c_str(), - req.begin.getKey().printable().c_str(), - req.end.getKey().printable().c_str(), - req.version); - } - - // A TSS should never be the source for fetch keys - ASSERT(!self->tssPairID.present() || !req.isFetchKeys); - // Warning: This code is executed at extremely high priority (TaskPriority::LoadBalancedEndpoint), so downgrade // before doing real work self->actors.add(self->readGuard(req, getKeyValuesQ)); @@ -4939,18 +4685,8 @@ ACTOR Future storageServerCore(StorageServer* self, StorageServerInterface ClientDBInfo clientInfo = self->db->get().client; Optional myTssPair = clientInfo.getTssPair(self->thisServerID); if (myTssPair.present()) { - // TODO REMOVE print, just for debugging - if (!self->ssPairID.present()) { - printf("SS %s found tss pair %s\n", - self->thisServerID.toString().c_str(), - myTssPair.get().id().toString().c_str()); - } self->setSSWithTssPair(myTssPair.get().id()); } else { - // TODO REMOVE print, just for debugging - if (self->ssPairID.present()) { - printf("SS %s lost tss pair\n", self->thisServerID.toString().c_str()); - } self->clearSSWithTssPair(); } } @@ -5057,17 +4793,11 @@ ACTOR Future storageServer(IKeyValueStore* persistentData, Reference> db, std::string folder) { state StorageServer self(persistentData, db, ssi); - if (ssi.isTss) { - self.setTssPair(ssi.tssPairID); + if (ssi.isTss()) { + self.setTssPair(ssi.tssPairID.get()); ASSERT(self.isTss()); } - // TODO REMOVE - printf("initializing %sstorage %s with tag %s and tss pair=%s\n", - ssi.isTss ? "testing " : "", - ssi.id().toString().c_str(), - seedTag.toString().c_str(), - self.tssPairID.present() ? self.tssPairID.get().toString().c_str() : ""); self.sk = serverKeysPrefixFor(self.tssPairID.present() ? 
self.tssPairID.get() : self.thisServerID) .withPrefix(systemKeys.begin); // FFFF/serverKeys/[this server]/ self.folder = folder; @@ -5080,12 +4810,7 @@ ACTOR Future storageServer(IKeyValueStore* persistentData, std::pair verAndTag = wait(addStorageServer( self.cx, ssi)); // Might throw recruitment_failed in case of simultaneous master failure self.tag = verAndTag.second; - // self.setInitialVersion(ssi.isTss ? 0 : verAndTag.first - 1); - if (ssi.isTss) { - printf("TSS %s overriding initial version from %lld to %lld\n", - ssi.id().toString().c_str(), - verAndTag.first - 1, - tssSeedVersion); + if (ssi.isTss()) { self.setInitialVersion(tssSeedVersion); } else { self.setInitialVersion(verAndTag.first - 1); @@ -5100,7 +4825,7 @@ ACTOR Future storageServer(IKeyValueStore* persistentData, TraceEvent("StorageServerInit", ssi.id()) .detail("Version", self.version.get()) .detail("SeedTag", seedTag.toString()) - .detail("TssPair", ssi.isTss ? ssi.tssPairID.toString() : ""); + .detail("TssPair", ssi.isTss() ? ssi.tssPairID.get().toString() : ""); InitializeStorageReply rep; rep.interf = ssi; rep.addedVersion = self.version.get(); @@ -5121,10 +4846,7 @@ ACTOR Future storageServer(IKeyValueStore* persistentData, } ACTOR Future replaceInterface(StorageServer* self, StorageServerInterface ssi) { - printf("SS %s replacing interface\ngetValue=%s\n", - ssi.id().toString().c_str(), - ssi.getValue.getEndpoint().token.toString().c_str()); - ASSERT(!ssi.isTss); + ASSERT(!ssi.isTss()); state Transaction tr(self->cx); loop { @@ -5140,16 +4862,8 @@ ACTOR Future replaceInterface(StorageServer* self, StorageServerInterface : Never())) { state GetStorageServerRejoinInfoReply rep = _rep; - printf("SS %s got rejoin reply:\nversion: %" PRIu64 "\ntag: %s\nnewTag: %s\nnewLocality: %s\n", - ssi.id().toString().c_str(), - rep.version, - rep.tag.toString().c_str(), - rep.newTag.present() ? rep.newTag.get().toString().c_str() : "", - rep.newLocality ? 
"true" : "false"); - try { tr.reset(); - // TODO why doesn't this need ACCESS_SYSTEM_KEYS? tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); tr.setVersion(rep.version); @@ -5184,7 +4898,6 @@ ACTOR Future replaceInterface(StorageServer* self, StorageServerInterface choose { when(wait(tr.commit())) { - printf("SS committed rejoin txn\n"); self->history = rep.history; if (rep.newTag.present()) { @@ -5213,7 +4926,6 @@ ACTOR Future replaceInterface(StorageServer* self, StorageServerInterface when(wait(infoChanged)) {} } } catch (Error& e) { - printf("rejoin txn got error: %d!!\n", e.code()); wait(tr.onError(e)); } } @@ -5229,20 +4941,14 @@ ACTOR Future replaceTSSInterface(StorageServer* self, StorageServerInterfa state Reference tr = makeReference(self->cx); state KeyBackedMap tssMapDB = KeyBackedMap(tssMappingKeys.begin); - ASSERT(ssi.isTss); - - printf("TSS %s replacing interface:\ngetValue=%s\n", - ssi.id().toString().c_str(), - ssi.getValue.getEndpoint().token.toString().c_str()); - - // TODO should this loop until successful? it should never have conflicts, in theory + ASSERT(ssi.isTss()); loop { try { state Tag myTag; tr->reset(); - tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); // TODO is this needed? 
+ tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); Optional pairTagValue = wait(tr->get(serverTagKeyFor(self->tssPairID.get()))); @@ -5263,17 +4969,10 @@ ACTOR Future replaceTSSInterface(StorageServer* self, StorageServerInterfa tr->set(tssMappingChangeKey, deterministicRandom()->randomUniqueID().toString()); wait(tr->commit()); - - // TODO trace event instead - printf("tss %s added itself back, got tag %s for partner %s\n", - self->thisServerID.toString().c_str(), - self->tag.toString().c_str(), - self->tssPairID.get().toString().c_str()); self->tag = myTag; break; } catch (Error& e) { - printf("tss replace interface got error %d!!\n", e.code()); wait(tr->onError(e)); } } @@ -5317,7 +5016,7 @@ ACTOR Future storageServer(IKeyValueStore* persistentData, // if this is a tss storage file, use that as source of truth for this server being a tss instead of the // presence of the tss pair key in the storage engine - if (ssi.isTss) { + if (ssi.isTss()) { ASSERT(self.isTss()); ssi.tssPairID = self.tssPairID.get(); } else { diff --git a/fdbserver/tester.actor.cpp b/fdbserver/tester.actor.cpp index 1b23040d0d..121e2477e0 100644 --- a/fdbserver/tester.actor.cpp +++ b/fdbserver/tester.actor.cpp @@ -979,8 +979,7 @@ ACTOR Future runTest(Database cx, testers, quiescent, spec.runConsistencyCheckOnCache, - // spec.runConsistencyCheckOnTSS, // TODO override with true to test - true, + spec.runConsistencyCheckOnTSS, 10000.0, 18000, spec.databasePingDelay, @@ -1429,19 +1428,14 @@ ACTOR Future runTests(ReferenceisSimulated() && enableDD) { - printf("waiting for DD\n"); wait(success(setDDMode(cx, 1))); - printf("done waiting for DD\n"); } } catch (Error& e) { TraceEvent(SevError, "TestFailure").error(e).detail("Reason", "Unable to set starting configuration"); } } - printf("starting configuration set, moving on\n"); - if (useDB && waitForQuiescenceBegin) { TraceEvent("TesterStartingPreTestChecks") 
.detail("DatabasePingDelay", databasePingDelay) @@ -1457,8 +1451,6 @@ ACTOR Future runTests(Reference storageServerRollbackRebooter(Future prevStorageServer, StorageServerInterface recruited; recruited.uniqueID = id; recruited.locality = locality; - recruited.isTss = isTss; + recruited.tssPairID = + isTss ? Optional(UID()) : Optional(); // set this here since we use its presence to determine + // whether this server is a tss or not recruited.initEndpoints(); DUMPTOKEN(recruited.getValue); @@ -1110,14 +1112,15 @@ ACTOR Future workerServer(Reference connFile, // TODO might be more efficient to mark a boolean on DiskStore in getDiskStores, but that kind of breaks // the abstraction since DiskStore also applies to storage cache + tlog bool isTss = s.filename.find(tssPrefix) != std::string::npos; - // TODO REMOVE after test - printf("%s is%s tss filename\n", s.filename.c_str(), isTss ? "" : " not"); Role ssRole = isTss ? Role::TESTING_STORAGE_SERVER : Role::STORAGE_SERVER; StorageServerInterface recruited; recruited.uniqueID = s.storeID; recruited.locality = locality; - recruited.isTss = isTss; + recruited.tssPairID = + isTss ? Optional(UID()) + : Optional(); // presence of optional is used as source of truth for tss vs not. Value + // gets overridden later in restoreDurableState recruited.initEndpoints(); std::map details; @@ -1509,27 +1512,17 @@ ACTOR Future workerServer(Reference connFile, when(InitializeStorageRequest req = waitNext(interf.storage.getFuture())) { if (!storageCache.exists(req.reqId)) { - printf("Got " - "InitializeStorageRequest:seedTag=%s\nreqId=%s\ninterfaceId=%s\nstoreType=%s\nisTss=%" - "s\ntssPairID=%s\ntssPairVersion=%lld\n\n", - req.seedTag.toString().c_str(), - req.reqId.toString().c_str(), - req.interfaceId.toString().c_str(), - req.storeType.toString().c_str(), - req.isTss ? "true" : "false", - req.isTss ? req.tssPairID.toString().c_str() : "", - req.isTss ? 
req.tssPairVersion : 0); + bool isTss = req.tssPairIDAndVersion.present(); StorageServerInterface recruited(req.interfaceId); recruited.locality = locality; - recruited.isTss = req.isTss; - recruited.tssPairID = req.tssPairID; + recruited.tssPairID = isTss ? req.tssPairIDAndVersion.get().first : Optional(); recruited.initEndpoints(); std::map details; details["StorageEngine"] = req.storeType.toString(); - details["IsTSS"] = std::to_string(recruited.isTss); - Role ssRole = recruited.isTss ? Role::TESTING_STORAGE_SERVER : Role::STORAGE_SERVER; + details["IsTSS"] = std::to_string(isTss); + Role ssRole = isTss ? Role::TESTING_STORAGE_SERVER : Role::STORAGE_SERVER; startRole(ssRole, recruited.id(), interf.id(), details); DUMPTOKEN(recruited.getValue); @@ -1545,21 +1538,25 @@ ACTOR Future workerServer(Reference connFile, DUMPTOKEN(recruited.getQueuingMetrics); DUMPTOKEN(recruited.getKeyValueStoreType); DUMPTOKEN(recruited.watchValue); - // TODO re-comment! - printf("Recruited as storageServer\n"); + // printf("Recruited as storageServer\n"); std::string filename = filenameFromId(req.storeType, folder, - recruited.isTss ? testingStoragePrefix.toString() : fileStoragePrefix.toString(), + isTss ? testingStoragePrefix.toString() : fileStoragePrefix.toString(), recruited.id()); IKeyValueStore* data = openKVStore(req.storeType, filename, recruited.id(), memoryLimit); Future kvClosed = data->onClosed(); filesClosed.add(kvClosed); ReplyPromise storageReady = req.reply; storageCache.set(req.reqId, storageReady.getFuture()); - Future s = - storageServer(data, recruited, req.seedTag, req.tssPairVersion, storageReady, dbInfo, folder); + Future s = storageServer(data, + recruited, + req.seedTag, + isTss ? 
req.tssPairIDAndVersion.get().second : 0, + storageReady, + dbInfo, + folder); s = handleIOErrors(s, data, recruited.id(), kvClosed); s = storageCache.removeOnReady(req.reqId, s); s = storageServerRollbackRebooter(s, @@ -1567,7 +1564,7 @@ ACTOR Future workerServer(Reference connFile, filename, recruited.id(), recruited.locality, - req.isTss, + isTss, dbInfo, folder, &filesClosed, diff --git a/fdbserver/workloads/ConsistencyCheck.actor.cpp b/fdbserver/workloads/ConsistencyCheck.actor.cpp index 0aae7ca9d4..3b5156fb1e 100644 --- a/fdbserver/workloads/ConsistencyCheck.actor.cpp +++ b/fdbserver/workloads/ConsistencyCheck.actor.cpp @@ -1062,7 +1062,7 @@ struct ConsistencyCheckWorkload : TestWorkload { .detail("Begin", printable(shard.begin)) .detail("End", printable(shard.end)) .detail("StorageServer", storageServers[i].id()) - .detail("IsTSS", storageServers[i].isTss ? "True" : "False") + .detail("IsTSS", storageServers[i].isTss() ? "True" : "False") .error(reply.getError()); estimatedBytes.push_back(-1); } @@ -1082,8 +1082,9 @@ struct ConsistencyCheckWorkload : TestWorkload { .detail("StorageServer1", storageServers[firstValidStorageServer].id()) .detail("StorageServer2", storageServers[i].id()) .detail("IsTSS", - storageServers[i].isTss || storageServers[firstValidStorageServer].isTss ? "True" - : "False"); + storageServers[i].isTss() || storageServers[firstValidStorageServer].isTss() + ? "True" + : "False"); } } } @@ -1247,24 +1248,14 @@ struct ConsistencyCheckWorkload : TestWorkload { // add TSS to end of list, if configured and if not relocating if (!isRelocating && self->performTSSCheck) { - printf("CCheck: Checking for tss to add: isRelocating: %s, performTSSCheck: %s\n", - isRelocating ? "T" : "F", - self->performTSSCheck ? 
"T" : "F"); int initialSize = storageServers.size(); for (int i = 0; i < initialSize; i++) { Optional tssPair = cx->clientInfo->get().getTssPair(storageServers[i]); if (tssPair.present()) { - printf("CCheck: Adding TSS %s to consistency check!\n", tssPair.get().id().toString().c_str()); storageServers.push_back(tssPair.get().id()); storageServerInterfaces.push_back(tssPair.get()); - } else { - printf("CCheck: SS %s doesn't have tss pair\n", storageServers[i].toString().c_str()); } } - } else { - printf("CCheck: Not checking for tss to add: isRelocating: %s, performTSSCheck: %s\n", - isRelocating ? "T" : "F", - self->performTSSCheck ? "T" : "F"); } state vector estimatedBytes = wait(self->getStorageSizeEstimate(storageServerInterfaces, range)); @@ -1355,7 +1346,7 @@ struct ConsistencyCheckWorkload : TestWorkload { if (g_network->isSimulated()) { int invalidIndex = -1; printf("\n%sSERVER %d (%s); shard = %s - %s:\n", - storageServerInterfaces[j].isTss ? "TSS " : "", + storageServerInterfaces[j].isTss() ? "TSS " : "", j, storageServerInterfaces[j].address().toString().c_str(), printable(req.begin.getKey()).c_str(), @@ -1374,7 +1365,7 @@ struct ConsistencyCheckWorkload : TestWorkload { printf( "\n%sSERVER %d (%s); shard = %s - %s:\n", - storageServerInterfaces[firstValidServer].isTss ? "TSS " : "", + storageServerInterfaces[firstValidServer].isTss() ? "TSS " : "", firstValidServer, storageServerInterfaces[firstValidServer].address().toString().c_str(), printable(req.begin.getKey()).c_str(), @@ -1465,17 +1456,15 @@ struct ConsistencyCheckWorkload : TestWorkload { .detail("ValueMismatchKey", printable(valueMismatchKey)) .detail("MatchingKVPairs", matchingKVPairs) .detail("IsTSS", - storageServerInterfaces[j].isTss || - storageServerInterfaces[firstValidServer].isTss + storageServerInterfaces[j].isTss() || + storageServerInterfaces[firstValidServer].isTss() ? "True" : "False"); - // TODO should the test still fail if TSS is wrong? 
Or is just logging the trace - // logs ok if ((g_network->isSimulated() && g_simulator.tssMode != ISimulator::TSSMode::EnabledDropMutations) || - (!storageServerInterfaces[j].isTss && - !storageServerInterfaces[firstValidServer].isTss)) { + (!storageServerInterfaces[j].isTss() && + !storageServerInterfaces[firstValidServer].isTss())) { self->testFailure("Data inconsistent", true); return false; } @@ -1497,19 +1486,12 @@ struct ConsistencyCheckWorkload : TestWorkload { .detail("UID", storageServerInterfaces[j].id()) .detail("GetKeyValuesToken", storageServerInterfaces[j].getKeyValues.getEndpoint().token) - .detail("IsTSS", storageServerInterfaces[j].isTss ? "True" : "False") + .detail("IsTSS", storageServerInterfaces[j].isTss() ? "True" : "False") .error(e); - printf("CC %sSS %s failed with error % d\n", - storageServerInterfaces[j].isTss ? "T" : "", - storageServers[j].toString().c_str(), - e.code()); - // All shards should be available in quiscence - // TODO should the test still fail if TSS is unavailable? Or is just logging the trace - // logs ok if (self->performQuiescentChecks && - (g_network->isSimulated() || !storageServerInterfaces[j].isTss)) { + (g_network->isSimulated() || !storageServerInterfaces[j].isTss())) { self->testFailure("Storage server unavailable"); return false; } @@ -1604,7 +1586,6 @@ struct ConsistencyCheckWorkload : TestWorkload { bool hasValidEstimate = estimatedBytes.size() > 0; // If the storage servers' sampled estimate of shard size is different from ours - // TODO should the test still fail if TSS has wrong estimate? 
Or is just logging the trace logs ok if (self->performQuiescentChecks) { for (int j = 0; j < estimatedBytes.size(); j++) { if (estimatedBytes[j] >= 0 && estimatedBytes[j] != sampledBytes) { @@ -1612,9 +1593,9 @@ struct ConsistencyCheckWorkload : TestWorkload { .detail("EstimatedBytes", estimatedBytes[j]) .detail("CorrectSampledBytes", sampledBytes) .detail("StorageServer", storageServers[j]) - .detail("IsTSS", storageServerInterfaces[j].isTss ? "True" : "False"); + .detail("IsTSS", storageServerInterfaces[j].isTss() ? "True" : "False"); - if (!storageServerInterfaces[j].isTss) { + if (!storageServerInterfaces[j].isTss()) { self->testFailure("Storage servers had incorrect sampled estimate"); } @@ -1622,7 +1603,7 @@ struct ConsistencyCheckWorkload : TestWorkload { break; } else if (estimatedBytes[j] < 0 && - (g_network->isSimulated() || !storageServerInterfaces[j].isTss)) { + (g_network->isSimulated() || !storageServerInterfaces[j].isTss())) { self->testFailure("Could not get storage metrics from server"); hasValidEstimate = false; break; @@ -1734,8 +1715,9 @@ struct ConsistencyCheckWorkload : TestWorkload { if (!keyValueStoreType.present()) { TraceEvent("ConsistencyCheck_ServerUnavailable").detail("ServerID", storageServers[i].id()); self->testFailure("Storage server unavailable"); - } else if ((!storageServers[i].isTss && keyValueStoreType.get() != configuration.storageServerStoreType) || - (storageServers[i].isTss && + } else if ((!storageServers[i].isTss() && + keyValueStoreType.get() != configuration.storageServerStoreType) || + (storageServers[i].isTss() && keyValueStoreType.get() != configuration.testingStorageServerStoreType)) { TraceEvent("ConsistencyCheck_WrongKeyValueStoreType") .detail("ServerID", storageServers[i].id()) @@ -1747,10 +1729,6 @@ struct ConsistencyCheckWorkload : TestWorkload { // Check each pair of storage servers for an address match for (j = i + 1; j < storageServers.size(); j++) { - // TODO change this hack back once i fix recruitment - 
/*if (storageServers[i].isTss || storageServers[j].isTss) { - continue; - }*/ if (storageServers[i].address() == storageServers[j].address()) { TraceEvent("ConsistencyCheck_UndesirableServer") .detail("StorageServer1", storageServers[i].id()) @@ -1773,16 +1751,6 @@ struct ConsistencyCheckWorkload : TestWorkload { state vector storageServers = wait(getStorageServers(cx)); std::vector> missingStorage; // vector instead of a set to get the count - printf("CC starting check for storage: %d workers, %d SS\n", workers.size(), storageServers.size()); - printf("CC checking %d regions: ", configuration.regions.size()); - if (configuration.regions.size() == 1) { - printf("%s", configuration.regions[0].dcId.toString().c_str()); - } else if (configuration.regions.size() == 2) { - printf("%s %s", - configuration.regions[0].dcId.toString().c_str(), - configuration.regions[1].dcId.toString().c_str()); - } - printf("\n"); for (int i = 0; i < workers.size(); i++) { NetworkAddress addr = workers[i].interf.stableAddress(); if (!configuration.isExcludedServer(workers[i].interf.addresses()) && @@ -1792,29 +1760,10 @@ struct ConsistencyCheckWorkload : TestWorkload { for (int j = 0; j < storageServers.size(); j++) { if (storageServers[j].stableAddress() == addr) { found = true; - printf("CC found SS %s on %s in dc %s\n", - storageServers[j].id().toString().c_str(), - addr.toString().c_str(), - workers[i].interf.locality.dcId().present() - ? workers[i].interf.locality.dcId().get().toString().c_str() - : ""); break; } } if (!found) { - if (configuration.regions.size() == 0 || - (configuration.regions.size() == 1 && - workers[i].interf.locality.dcId() == configuration.regions[0].dcId) || - (configuration.regions.size() == 2 && - (workers[i].interf.locality.dcId() == configuration.regions[0].dcId || - workers[i].interf.locality.dcId() == configuration.regions[1].dcId))) { - printf("CC found no SS on %s in dc %s\n", - addr.toString().c_str(), - workers[i].interf.locality.dcId().present() - ? 
workers[i].interf.locality.dcId().get().toString().c_str() - : ""); - } - TraceEvent("ConsistencyCheck_NoStorage") .detail("Address", addr) .detail("ProcessClassEqualToStorageClass", @@ -1839,7 +1788,6 @@ struct ConsistencyCheckWorkload : TestWorkload { // TODO could improve this check by also ensuring DD is currently recruiting a TSS by using quietdb? bool couldExpectMissingTss = (configuration.desiredTSSCount - self->dbInfo->get().client.tssMapping.size()) > 0; - printf("CC couldExpectMissingTss = %s\n", couldExpectMissingTss ? "True" : "False"); int countMissing = missingStorage.size(); int acceptableTssMissing = 1; @@ -1858,16 +1806,10 @@ struct ConsistencyCheckWorkload : TestWorkload { } if (!couldExpectMissingTss || countMissing > acceptableTssMissing) { - printf("No storage server on %d workers. CouldBeTSS=%s, acceptableTssMissing=%d\n", - countMissing, - couldExpectMissingTss ? "T" : "F", - acceptableTssMissing); self->testFailure("No storage server on worker"); return false; } else { - // TODO sev=30 warn instead of print - printf("CC found %d missing storage server on worker, but it is likely a tss(es) waiting for a pair\n", - configuration.usableRegions); + TraceEvent(SevWarn, "ConsistencyCheck_TSSMissing"); } } @@ -1885,10 +1827,8 @@ struct ConsistencyCheckWorkload : TestWorkload { state bool foundExtraDataStore = false; state std::vector protectedProcessesToKill; - printf("CC checking for extra data stores\n"); state std::map> statefulProcesses; for (const auto& ss : storageServers) { - printf("CC Marking %ss as ok\n", ss.id().toString().c_str()); statefulProcesses[ss.address()].insert(ss.id()); // A process may have two addresses (same ip, different ports) if (ss.secondaryAddress().present()) { @@ -1945,9 +1885,6 @@ struct ConsistencyCheckWorkload : TestWorkload { if (statefulProcesses[itr->interf.address()].count(id)) { continue; } - printf("CC found extra data store %s on %s\n", - id.toString().c_str(), - itr->interf.address().toString().c_str()); 
// For extra data store TraceEvent("ConsistencyCheck_ExtraDataStore") .detail("Address", itr->interf.address()) @@ -1980,10 +1917,7 @@ struct ConsistencyCheckWorkload : TestWorkload { } } - printf("CC check for extra data stores complete\n"); - if (foundExtraDataStore) { - printf("CC Extra Data Stores\n"); self->testFailure("Extra data stores present on workers"); return false; } diff --git a/fdbserver/workloads/RandomMoveKeys.actor.cpp b/fdbserver/workloads/RandomMoveKeys.actor.cpp index 4fe5654a18..887c6da897 100644 --- a/fdbserver/workloads/RandomMoveKeys.actor.cpp +++ b/fdbserver/workloads/RandomMoveKeys.actor.cpp @@ -168,7 +168,7 @@ struct MoveKeysWorkload : TestWorkload { count[servers[s].address()]++; int o = 0; for (int s = 0; s < servers.size(); s++) - if (count[servers[s].address()] == 1 && !servers[s].isTss) + if (count[servers[s].address()] == 1 && !servers[s].isTss()) servers[o++] = servers[s]; servers.resize(o); } diff --git a/fdbserver/workloads/workloads.actor.h b/fdbserver/workloads/workloads.actor.h index ffd669e88b..d85e3469b5 100644 --- a/fdbserver/workloads/workloads.actor.h +++ b/fdbserver/workloads/workloads.actor.h @@ -152,7 +152,7 @@ public: databasePingDelay = g_network->isSimulated() ? 
0.0 : 15.0; runConsistencyCheck = g_network->isSimulated(); runConsistencyCheckOnCache = false; - runConsistencyCheckOnTSS = false; + runConsistencyCheckOnTSS = true; waitForQuiescenceBegin = true; waitForQuiescenceEnd = true; simCheckRelocationDuration = false; diff --git a/flow/Knobs.cpp b/flow/Knobs.cpp index e4d5a4e6f9..1d287feed3 100644 --- a/flow/Knobs.cpp +++ b/flow/Knobs.cpp @@ -234,6 +234,7 @@ void FlowKnobs::initialize(bool randomize, bool isSimulated) { init( BASIC_LOAD_BALANCE_MIN_CPU, 0.05 ); //do not adjust LB probabilities if the proxies are less than 5% utilized init( BASIC_LOAD_BALANCE_BUCKETS, 40 ); //proxies bin recent GRV requests into 40 time bins init( BASIC_LOAD_BALANCE_COMPUTE_PRECISION, 10000 ); //determines how much of the LB usage is holding the CPU usage of the proxy + init( LOAD_BALANCE_TSS_TIMEOUT, 5.0 ); // Health Monitor init( FAILURE_DETECTION_DELAY, 4.0 ); if( randomize && BUGGIFY ) FAILURE_DETECTION_DELAY = 1.0; diff --git a/flow/Knobs.h b/flow/Knobs.h index 67ec3b82b7..9b700613f3 100644 --- a/flow/Knobs.h +++ b/flow/Knobs.h @@ -250,6 +250,7 @@ public: int BASIC_LOAD_BALANCE_COMPUTE_PRECISION; double BASIC_LOAD_BALANCE_MIN_REQUESTS; double BASIC_LOAD_BALANCE_MIN_CPU; + double LOAD_BALANCE_TSS_TIMEOUT; // Health Monitor int FAILURE_DETECTION_DELAY; diff --git a/flow/genericactors.actor.h b/flow/genericactors.actor.h index e0e84c6e25..7bf2a05e63 100644 --- a/flow/genericactors.actor.h +++ b/flow/genericactors.actor.h @@ -1230,8 +1230,6 @@ Future brokenPromiseToMaybeDelivered(Future in) { return t; } catch (Error& e) { if (e.code() == error_code_broken_promise) { - // TODO REMOVE! 
- printf("broken promise!!"); throw request_maybe_delivered(); } throw; diff --git a/flow/serialize.h b/flow/serialize.h index 7653648a80..81bb18ad4d 100644 --- a/flow/serialize.h +++ b/flow/serialize.h @@ -22,9 +22,6 @@ #define FLOW_SERIALIZE_H #pragma once -// TODO REMOVE -#include - #include #include #include @@ -112,12 +109,6 @@ class Serializer { public: static void serialize(Archive& ar, T& t) { t.serialize(ar); - // TODO REMOVE - if (!ar.protocolVersion().isValid()) { - printf("invalid protocol version %" PRIx64 " < %" PRIx64 "!!!\n", - ar.protocolVersion().version(), - ProtocolVersion::minValidProtocolVersion); - } ASSERT(ar.protocolVersion().isValid()); } }; From 95ab07fcb698ff5cbc926ada3c9974b5cb33b031 Mon Sep 17 00:00:00 2001 From: Josh Slocum Date: Tue, 25 May 2021 20:42:07 +0000 Subject: [PATCH 437/461] Adding comments for clarity --- fdbserver/VersionedBTree.actor.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 49eac05655..454e7ea5d9 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -7565,11 +7565,11 @@ TEST_CASE("/redwood/correctness/unit/deltaTree/IntIntPair") { { deterministicRandom()->randomInt(prev.k, next.k), deterministicRandom()->randomInt(prev.v, next.v) }); }; - // Build a set of N unique items + // Build a set of N unique items, where no consecutive items are in the set, a requirement of the seek behavior tests. 
std::set uniqueItems; while (uniqueItems.size() < N) { IntIntPair p = randomPair(); - auto nextP = p; // also check if next highest/lowest key is not in set for testLTE/testGTE + auto nextP = p; // also check if next highest/lowest key is not in set nextP.v++; auto prevP = p; prevP.v--; @@ -7591,7 +7591,7 @@ TEST_CASE("/redwood/correctness/unit/deltaTree/IntIntPair") { std::vector toDelete; while (1) { IntIntPair p = randomPair(); - auto nextP = p; // also check if next highest/lowest key is not in set for testLTE/testGTE + auto nextP = p; // also check if next highest/lowest key is not in the set nextP.v++; auto prevP = p; prevP.v--; @@ -7745,6 +7745,7 @@ TEST_CASE("/redwood/correctness/unit/deltaTree/IntIntPair") { } // SeekLTE to the next possible int pair value after each element to make sure the base element is found + // Assumes no consecutive items are present in the set for (int i = 0; i < items.size(); ++i) { IntIntPair p = items[i]; IntIntPair q = p; @@ -7761,6 +7762,7 @@ TEST_CASE("/redwood/correctness/unit/deltaTree/IntIntPair") { } // SeekGTE to the previous possible int pair value after each element to make sure the base element is found + // Assumes no consecutive items are present in the set for (int i = 0; i < items.size(); ++i) { IntIntPair p = items[i]; IntIntPair q = p; @@ -7796,6 +7798,7 @@ TEST_CASE("/redwood/correctness/unit/deltaTree/IntIntPair") { } // SeekLTE to each element's next possible value, using each element as a hint + // Assumes no consecutive items are present in the set for (int i = 0; i < items.size(); ++i) { IntIntPair p = items[i]; IntIntPair q = p; From d68cb9b04872a505a2bfa4782f8a4e7e83c2efd3 Mon Sep 17 00:00:00 2001 From: Josh Slocum Date: Tue, 25 May 2021 20:06:32 +0000 Subject: [PATCH 438/461] Changing role names and enabling tss by default in consistency check --- fdbserver/SimulatedCluster.actor.cpp | 12 ++++-------- fdbserver/worker.actor.cpp | 4 ++-- fdbserver/workloads/ConsistencyCheck.actor.cpp | 2 +- 
flow/ProtocolVersion.h | 1 - 4 files changed, 7 insertions(+), 12 deletions(-) diff --git a/fdbserver/SimulatedCluster.actor.cpp b/fdbserver/SimulatedCluster.actor.cpp index 70fed849d9..1747047d85 100644 --- a/fdbserver/SimulatedCluster.actor.cpp +++ b/fdbserver/SimulatedCluster.actor.cpp @@ -186,7 +186,7 @@ public: bool configureLocked = false; bool startIncompatibleProcess = false; int logAntiQuorum = -1; - bool firstTestInRestart = false; + bool isFirstTestInRestart = false; // Storage Engine Types: Verify match with SimulationConfig::generateNormalConfig // 0 = "ssd" // 1 = "memory" @@ -1171,7 +1171,7 @@ void SimulationConfig::generateNormalConfig(const TestConfig& testConfig) { } int tssCount = 0; - if (!testconfig.simpleConfig && deterministicRandom()->random01() < 0.25) { + if (!testConfig.simpleConfig && deterministicRandom()->random01() < 0.25) { // 1 or 2 tss tssCount = deterministicRandom()->randomInt(1, 3); } @@ -1189,14 +1189,10 @@ void SimulationConfig::generateNormalConfig(const TestConfig& testConfig) { db.grvProxyCount = 1; db.resolverCount = 1; } + int replication_type = testConfig.simpleConfig ? 1 : (std::max(testConfig.minimumReplication, datacenters > 4 ? deterministicRandom()->randomInt(1, 3) : std::min(deterministicRandom()->randomInt(0, 6), 3))); if (testConfig.config.present()) { set_config(testConfig.config.get()); } else { - int replication_type = testConfig.simpleConfig - ? 1 - : (std::max(testConfig.minimumReplication, - datacenters > 4 ? 
deterministicRandom()->randomInt(1, 3) - : std::min(deterministicRandom()->randomInt(0, 6), 3))); switch (replication_type) { case 0: { TEST(true); // Simulated cluster using custom redundancy mode @@ -1513,7 +1509,7 @@ void SimulationConfig::generateNormalConfig(const TestConfig& testConfig) { tssCount = std::max(0, std::min(tssCount, (db.usableRegions * (machine_count / datacenters) - replication_type) / 2)); - if (tssCount > 0) { + if (!testConfig.config.present() && tssCount > 0) { std::string confStr = format("tss_count:=%d tss_storage_engine:=%d", tssCount, db.storageServerStoreType); set_config(confStr); double tssRandom = deterministicRandom()->random01(); diff --git a/fdbserver/worker.actor.cpp b/fdbserver/worker.actor.cpp index 11b90dbae5..d48b9cd628 100644 --- a/fdbserver/worker.actor.cpp +++ b/fdbserver/worker.actor.cpp @@ -2151,7 +2151,7 @@ ACTOR Future fdbd(Reference connFile, const Role Role::WORKER("Worker", "WK", false); const Role Role::STORAGE_SERVER("StorageServer", "SS"); -const Role Role::TESTING_STORAGE_SERVER("TestingStorageServer", "TS"); +const Role Role::TESTING_STORAGE_SERVER("TestingStorageServer", "ST"); const Role Role::TRANSACTION_LOG("TLog", "TL"); const Role Role::SHARED_TRANSACTION_LOG("SharedTLog", "SL", false); const Role Role::COMMIT_PROXY("CommitProxyServer", "CP"); @@ -2159,7 +2159,7 @@ const Role Role::GRV_PROXY("GrvProxyServer", "GP"); const Role Role::MASTER("MasterServer", "MS"); const Role Role::RESOLVER("Resolver", "RV"); const Role Role::CLUSTER_CONTROLLER("ClusterController", "CC"); -const Role Role::TESTER("TestClient", "TC"); +const Role Role::TESTER("Tester", "TS"); const Role Role::LOG_ROUTER("LogRouter", "LR"); const Role Role::DATA_DISTRIBUTOR("DataDistributor", "DD"); const Role Role::RATEKEEPER("Ratekeeper", "RK"); diff --git a/fdbserver/workloads/ConsistencyCheck.actor.cpp b/fdbserver/workloads/ConsistencyCheck.actor.cpp index 3b5156fb1e..459501198e 100644 --- a/fdbserver/workloads/ConsistencyCheck.actor.cpp 
+++ b/fdbserver/workloads/ConsistencyCheck.actor.cpp @@ -97,7 +97,7 @@ struct ConsistencyCheckWorkload : TestWorkload { ConsistencyCheckWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) { performQuiescentChecks = getOption(options, LiteralStringRef("performQuiescentChecks"), false); performCacheCheck = getOption(options, LiteralStringRef("performCacheCheck"), false); - performTSSCheck = getOption(options, LiteralStringRef("performTSSCheck"), false); + performTSSCheck = getOption(options, LiteralStringRef("performTSSCheck"), true); quiescentWaitTimeout = getOption(options, LiteralStringRef("quiescentWaitTimeout"), 600.0); distributed = getOption(options, LiteralStringRef("distributed"), true); shardSampleFactor = std::max(getOption(options, LiteralStringRef("shardSampleFactor"), 1), 1); diff --git a/flow/ProtocolVersion.h b/flow/ProtocolVersion.h index 7feb6b3839..af7c6f1108 100644 --- a/flow/ProtocolVersion.h +++ b/flow/ProtocolVersion.h @@ -138,7 +138,6 @@ public: // introduced features PROTOCOL_VERSION_FEATURE(0x0FDB00B070010000LL, StableInterfaces); PROTOCOL_VERSION_FEATURE(0x0FDB00B070010001LL, TagThrottleValueReason); PROTOCOL_VERSION_FEATURE(0x0FDB00B070010001LL, SpanContext); - // TODO is this right? 
PROTOCOL_VERSION_FEATURE(0x0FDB00B070010001LL, TSS); }; From 6bd7fa4036bc0cad3cbcf22655379d8868df84b6 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Wed, 24 Mar 2021 16:27:35 -0600 Subject: [PATCH 439/461] Actually close files in simulation --- fdbrpc/AsyncFileNonDurable.actor.h | 4 ++++ fdbrpc/sim2.actor.cpp | 16 ++++++++-------- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/fdbrpc/AsyncFileNonDurable.actor.h b/fdbrpc/AsyncFileNonDurable.actor.h index 00c0f7441d..c5f065a4e3 100644 --- a/fdbrpc/AsyncFileNonDurable.actor.h +++ b/fdbrpc/AsyncFileNonDurable.actor.h @@ -276,6 +276,10 @@ public: Future deleteFuture = deleteFile(this); if (!deleteFuture.isReady()) filesBeingDeleted[filename] = deleteFuture; + } else if (isSoleOwner()) { + // isSoleOwner is a bit confusing here. What we mean is that the openFiles map is the sole owner. If we + // we remove the file from the map to make sure it gets closed. + g_simulator.getCurrentProcess()->machine->openFiles.erase(filename); } } diff --git a/fdbrpc/sim2.actor.cpp b/fdbrpc/sim2.actor.cpp index 1af14ec676..6cddbb7e88 100644 --- a/fdbrpc/sim2.actor.cpp +++ b/fdbrpc/sim2.actor.cpp @@ -536,7 +536,10 @@ public: std::string getFilename() const override { return actualFilename; } - ~SimpleFile() override { _close(h); } + ~SimpleFile() override { + _close(h); + --openCount; + } private: int h; @@ -1933,10 +1936,7 @@ public: TraceEvent("ClogInterface") .detail("IP", ip.toString()) .detail("Delay", seconds) - .detail("Queue", - mode == ClogSend ? "Send" - : mode == ClogReceive ? "Receive" - : "All"); + .detail("Queue", mode == ClogSend ? "Send" : mode == ClogReceive ? "Receive" : "All"); if (mode == ClogSend || mode == ClogAll) g_clogging.clogSendFor(ip, seconds); @@ -2408,9 +2408,9 @@ int sf_open(const char* filename, int flags, int convFlags, int mode) { GENERIC_READ | ((flags & IAsyncFile::OPEN_READWRITE) ? 
GENERIC_WRITE : 0), FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, nullptr, - (flags & IAsyncFile::OPEN_EXCLUSIVE) ? CREATE_NEW - : (flags & IAsyncFile::OPEN_CREATE) ? OPEN_ALWAYS - : OPEN_EXISTING, + (flags & IAsyncFile::OPEN_EXCLUSIVE) + ? CREATE_NEW + : (flags & IAsyncFile::OPEN_CREATE) ? OPEN_ALWAYS : OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, nullptr); int h = -1; From f32ce0c4b54265f2961f81663c1cc77a177f2e2d Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Wed, 24 Mar 2021 16:56:11 -0600 Subject: [PATCH 440/461] fix typo --- fdbrpc/AsyncFileNonDurable.actor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbrpc/AsyncFileNonDurable.actor.h b/fdbrpc/AsyncFileNonDurable.actor.h index c5f065a4e3..a1508f7fef 100644 --- a/fdbrpc/AsyncFileNonDurable.actor.h +++ b/fdbrpc/AsyncFileNonDurable.actor.h @@ -278,7 +278,7 @@ public: filesBeingDeleted[filename] = deleteFuture; } else if (isSoleOwner()) { // isSoleOwner is a bit confusing here. What we mean is that the openFiles map is the sole owner. If we - // we remove the file from the map to make sure it gets closed. + // remove the file from the map to make sure it gets closed. g_simulator.getCurrentProcess()->machine->openFiles.erase(filename); } } From 04613c3b1346fbe2b23bf4d1fb8edfc6a7d9ae02 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Wed, 24 Mar 2021 19:57:24 -0600 Subject: [PATCH 441/461] handle file renames properly --- fdbrpc/AsyncFileNonDurable.actor.h | 12 +++++++++++- flow/flow.h | 2 ++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/fdbrpc/AsyncFileNonDurable.actor.h b/fdbrpc/AsyncFileNonDurable.actor.h index a1508f7fef..28b3506d6e 100644 --- a/fdbrpc/AsyncFileNonDurable.actor.h +++ b/fdbrpc/AsyncFileNonDurable.actor.h @@ -279,7 +279,17 @@ public: } else if (isSoleOwner()) { // isSoleOwner is a bit confusing here. What we mean is that the openFiles map is the sole owner. If we // remove the file from the map to make sure it gets closed. 
- g_simulator.getCurrentProcess()->machine->openFiles.erase(filename); + auto& openFiles = g_simulator.getCurrentProcess()->machine->openFiles; + auto iter = openFiles.find(filename); + // the file could've been renamed (DiskQueue does that for example). In that case the file won't be in the + // map anymore. + if (iter != openFiles.end()) { + // even if the filename exists, it doesn't mean that it references the same file. It could be that the + // file was renamed and later a file with the same name was opened. + if (iter->second.canGet() && iter->second.get().getPtr() == this) { + openFiles.erase(filename); + } + } } } diff --git a/flow/flow.h b/flow/flow.h index 987572d7c5..e03d598d9b 100644 --- a/flow/flow.h +++ b/flow/flow.h @@ -674,6 +674,8 @@ public: bool isValid() const { return sav != 0; } bool isReady() const { return sav->isSet(); } bool isError() const { return sav->isError(); } + // returns true if get can be called on this future (counterpart of canBeSet on Promises) + bool canGet() const { return isValid() && isReady() && !isError(); } Error& getError() const { ASSERT(isError()); return sav->error_state; From 7cb767fd3c00b459f1546d68add6d67e0ade78b2 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Thu, 25 Mar 2021 13:22:29 -0600 Subject: [PATCH 442/461] only remove files from the open map if they have no modifications in flight --- fdbrpc/AsyncFileNonDurable.actor.h | 49 ++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/fdbrpc/AsyncFileNonDurable.actor.h b/fdbrpc/AsyncFileNonDurable.actor.h index 28b3506d6e..ef686271c5 100644 --- a/fdbrpc/AsyncFileNonDurable.actor.h +++ b/fdbrpc/AsyncFileNonDurable.actor.h @@ -268,6 +268,37 @@ public: //TraceEvent("AsyncFileNonDurable_Destroy", id).detail("Filename", filename); } + // The purpose of this actor is to simply keep a reference to a non-durable file until all pending modifications + // have completed. 
When they return, this actor will die and therefore decrement the reference count by 1. + ACTOR void waitOnOutstandingModifications(Reference self) { + state ISimulator::ProcessInfo* currentProcess = g_simulator.getCurrentProcess(); + state TaskPriority currentTaskID = g_network->getCurrentTask(); + state std::string filename = self->filename; + + wait(g_simulator.onMachine(currentProcess)); + try { + Promise startSyncPromise = self->startSyncPromise; + self->startSyncPromise = Promise(); + startSyncPromise.send(true); + + std::vector> outstandingModifications; + + for (auto itr = self->pendingModifications.ranges().begin(); + itr != self->pendingModifications.ranges().end(); + ++itr) + if (itr->value().isValid() && !itr->value().isReady()) + outstandingModifications.push_back(itr->value()); + + // Ignore errors here so that all modifications can finish + wait(waitForAllReady(outstandingModifications)); + wait(g_simulator.onProcess(currentProcess, currentTaskID)); + } catch (Error& e) { + state Error err = e; + wait(g_simulator.onProcess(currentProcess, currentTaskID)); + throw err; + } + } + void addref() override { ReferenceCounted::addref(); } void delref() override { if (delref_no_destroy()) { @@ -279,6 +310,24 @@ public: } else if (isSoleOwner()) { // isSoleOwner is a bit confusing here. What we mean is that the openFiles map is the sole owner. If we // remove the file from the map to make sure it gets closed. + bool hasPendingModifications = false; + for (auto iter = pendingModifications.ranges().begin(); iter != pendingModifications.ranges().end(); + ++iter) { + if (iter->value().isValid() && !iter->value().isReady()) { + hasPendingModifications = true; + break; + } + } + if (hasPendingModifications) { + // If we still have pending references we won't close the file and instead wait for them. But while we + // wait for those to complete, another actor might open the file. 
So we call into an actor that will + // hold a refernce until all pending operations are complete. If someone opens this file before this + // completes, nothing will happen. Otherwise we will enter delref again but this time + // hasPendingModifications will evalualte to false. + addref(); + waitOnOutstandingModifications(Reference(this)); + return; + } auto& openFiles = g_simulator.getCurrentProcess()->machine->openFiles; auto iter = openFiles.find(filename); // the file could've been renamed (DiskQueue does that for example). In that case the file won't be in the From 7b4de4e037bca8ea9cf99b8a77ab8db594cb9fb3 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Thu, 25 Mar 2021 14:00:07 -0600 Subject: [PATCH 443/461] Revert change --- fdbrpc/AsyncFileNonDurable.actor.h | 47 +++++++----------------------- 1 file changed, 11 insertions(+), 36 deletions(-) diff --git a/fdbrpc/AsyncFileNonDurable.actor.h b/fdbrpc/AsyncFileNonDurable.actor.h index ef686271c5..cc341ea155 100644 --- a/fdbrpc/AsyncFileNonDurable.actor.h +++ b/fdbrpc/AsyncFileNonDurable.actor.h @@ -276,27 +276,20 @@ public: state std::string filename = self->filename; wait(g_simulator.onMachine(currentProcess)); - try { - Promise startSyncPromise = self->startSyncPromise; - self->startSyncPromise = Promise(); - startSyncPromise.send(true); + Promise startSyncPromise = self->startSyncPromise; + self->startSyncPromise = Promise(); + startSyncPromise.send(true); - std::vector> outstandingModifications; + std::vector> outstandingModifications; - for (auto itr = self->pendingModifications.ranges().begin(); - itr != self->pendingModifications.ranges().end(); - ++itr) - if (itr->value().isValid() && !itr->value().isReady()) - outstandingModifications.push_back(itr->value()); + for (auto itr = self->pendingModifications.ranges().begin(); itr != self->pendingModifications.ranges().end(); + ++itr) + if (itr->value().isValid() && !itr->value().isReady()) + outstandingModifications.push_back(itr->value()); - // Ignore 
errors here so that all modifications can finish - wait(waitForAllReady(outstandingModifications)); - wait(g_simulator.onProcess(currentProcess, currentTaskID)); - } catch (Error& e) { - state Error err = e; - wait(g_simulator.onProcess(currentProcess, currentTaskID)); - throw err; - } + // Ignore errors here so that all modifications can finish + wait(waitForAllReady(outstandingModifications)); + wait(g_simulator.onProcess(currentProcess, currentTaskID)); } void addref() override { ReferenceCounted::addref(); } @@ -310,24 +303,6 @@ public: } else if (isSoleOwner()) { // isSoleOwner is a bit confusing here. What we mean is that the openFiles map is the sole owner. If we // remove the file from the map to make sure it gets closed. - bool hasPendingModifications = false; - for (auto iter = pendingModifications.ranges().begin(); iter != pendingModifications.ranges().end(); - ++iter) { - if (iter->value().isValid() && !iter->value().isReady()) { - hasPendingModifications = true; - break; - } - } - if (hasPendingModifications) { - // If we still have pending references we won't close the file and instead wait for them. But while we - // wait for those to complete, another actor might open the file. So we call into an actor that will - // hold a refernce until all pending operations are complete. If someone opens this file before this - // completes, nothing will happen. Otherwise we will enter delref again but this time - // hasPendingModifications will evalualte to false. - addref(); - waitOnOutstandingModifications(Reference(this)); - return; - } auto& openFiles = g_simulator.getCurrentProcess()->machine->openFiles; auto iter = openFiles.find(filename); // the file could've been renamed (DiskQueue does that for example). 
In that case the file won't be in the From cbce2f6f117ed2b6eb0064151648f2c730928844 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Thu, 1 Apr 2021 14:06:13 -0600 Subject: [PATCH 444/461] delete dead code --- fdbrpc/AsyncFileNonDurable.actor.h | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/fdbrpc/AsyncFileNonDurable.actor.h b/fdbrpc/AsyncFileNonDurable.actor.h index cc341ea155..28b3506d6e 100644 --- a/fdbrpc/AsyncFileNonDurable.actor.h +++ b/fdbrpc/AsyncFileNonDurable.actor.h @@ -268,30 +268,6 @@ public: //TraceEvent("AsyncFileNonDurable_Destroy", id).detail("Filename", filename); } - // The purpose of this actor is to simply keep a reference to a non-durable file until all pending modifications - // have completed. When they return, this actor will die and therefore decrement the reference count by 1. - ACTOR void waitOnOutstandingModifications(Reference self) { - state ISimulator::ProcessInfo* currentProcess = g_simulator.getCurrentProcess(); - state TaskPriority currentTaskID = g_network->getCurrentTask(); - state std::string filename = self->filename; - - wait(g_simulator.onMachine(currentProcess)); - Promise startSyncPromise = self->startSyncPromise; - self->startSyncPromise = Promise(); - startSyncPromise.send(true); - - std::vector> outstandingModifications; - - for (auto itr = self->pendingModifications.ranges().begin(); itr != self->pendingModifications.ranges().end(); - ++itr) - if (itr->value().isValid() && !itr->value().isReady()) - outstandingModifications.push_back(itr->value()); - - // Ignore errors here so that all modifications can finish - wait(waitForAllReady(outstandingModifications)); - wait(g_simulator.onProcess(currentProcess, currentTaskID)); - } - void addref() override { ReferenceCounted::addref(); } void delref() override { if (delref_no_destroy()) { From a7564696702442ed7397cce43b53251ad0718f9d Mon Sep 17 00:00:00 2001 From: "A.J. 
Beamon" Date: Wed, 26 May 2021 13:38:24 -0700 Subject: [PATCH 445/461] Use a weak reference in the open files cache (abstracted from a similar cache in AsyncFileCached) to avoid a problem where removing an item from the cache could cause us to reentrantly remove it again. --- fdbrpc/AsyncFileCached.actor.cpp | 3 +- fdbrpc/AsyncFileCached.actor.h | 28 ++++------------ fdbrpc/AsyncFileNonDurable.actor.h | 40 +++++++++++++---------- fdbrpc/sim2.actor.cpp | 32 ++++++++++++------- fdbrpc/simulator.h | 6 +++- fdbserver/SimulatedCluster.actor.cpp | 9 +++--- flow/genericactors.actor.h | 48 ++++++++++++++++++++++++++++ 7 files changed, 110 insertions(+), 56 deletions(-) diff --git a/fdbrpc/AsyncFileCached.actor.cpp b/fdbrpc/AsyncFileCached.actor.cpp index f4a57d4646..984795c105 100644 --- a/fdbrpc/AsyncFileCached.actor.cpp +++ b/fdbrpc/AsyncFileCached.actor.cpp @@ -46,7 +46,8 @@ EvictablePage::~EvictablePage() { } } -std::map AsyncFileCached::openFiles; +// A map of filename to the file handle for all opened cached files +std::map> AsyncFileCached::openFiles; void AsyncFileCached::remove_page(AFCPage* page) { pages.erase(page->pageOffset); diff --git a/fdbrpc/AsyncFileCached.actor.h b/fdbrpc/AsyncFileCached.actor.h index c5b6b3127c..2915b0557c 100644 --- a/fdbrpc/AsyncFileCached.actor.h +++ b/fdbrpc/AsyncFileCached.actor.h @@ -132,27 +132,13 @@ struct EvictablePageCache : ReferenceCounted { const CacheEvictionType cacheEvictionType; }; -struct OpenFileInfo : NonCopyable { - IAsyncFile* f; - Future> opened; // Only valid until the file is fully opened - - OpenFileInfo() : f(0) {} - OpenFileInfo(OpenFileInfo&& r) noexcept : f(r.f), opened(std::move(r.opened)) { r.f = 0; } - - Future> get() { - if (f) - return Reference::addRef(f); - else - return opened; - } -}; - struct AFCPage; class AsyncFileCached final : public IAsyncFile, public ReferenceCounted { friend struct AFCPage; public: + // Opens a file that uses the FDB in-memory page cache static Future> open(std::string 
filename, int flags, int mode) { //TraceEvent("AsyncFileCachedOpen").detail("Filename", filename); if (openFiles.find(filename) == openFiles.end()) { @@ -160,7 +146,7 @@ public: if (f.isReady() && f.isError()) return f; if (!f.isReady()) - openFiles[filename].opened = f; + openFiles[filename] = WeakFutureReference(f); else return f.get(); } @@ -263,7 +249,9 @@ public: ~AsyncFileCached() override; private: - static std::map openFiles; + // A map of filename to the file handle for all opened cached files + static std::map> openFiles; + std::string filename; Reference uncached; int64_t length; @@ -330,6 +318,7 @@ private: static Future> open_impl(std::string filename, int flags, int mode); + // Opens a file that uses the FDB in-memory page cache ACTOR static Future> open_impl(std::string filename, int flags, int mode, @@ -345,10 +334,7 @@ private: TraceEvent("AFCUnderlyingOpenEnd").detail("Filename", filename); int64_t l = wait(f->size()); TraceEvent("AFCUnderlyingSize").detail("Filename", filename).detail("Size", l); - auto& of = openFiles[filename]; - of.f = new AsyncFileCached(f, filename, l, pageCache); - of.opened = Future>(); - return Reference(of.f); + return new AsyncFileCached(f, filename, l, pageCache); } catch (Error& e) { if (e.code() != error_code_actor_cancelled) openFiles.erase(filename); diff --git a/fdbrpc/AsyncFileNonDurable.actor.h b/fdbrpc/AsyncFileNonDurable.actor.h index 28b3506d6e..ccc2ad42b4 100644 --- a/fdbrpc/AsyncFileNonDurable.actor.h +++ b/fdbrpc/AsyncFileNonDurable.actor.h @@ -130,6 +130,9 @@ public: UID id; std::string filename; + // For files that use atomic write and create, they are initially created with an extra suffix + std::string initialFilename; + // An approximation of the size of the file; .size() should be used instead of this variable in most cases mutable int64_t approximateSize; @@ -182,11 +185,13 @@ private: reponses; // cannot call getResult on this actor collection, since the actors will be on different processes 
AsyncFileNonDurable(const std::string& filename, + const std::string& initialFilename, Reference file, Reference diskParameters, NetworkAddress openedAddress, bool aio) - : openedAddress(openedAddress), pendingModifications(uint64_t(-1)), approximateSize(0), reponses(false), + : filename(filename), initialFilename(initialFilename), file(file), diskParameters(diskParameters), + openedAddress(openedAddress), pendingModifications(uint64_t(-1)), approximateSize(0), reponses(false), aio(aio) { // This is only designed to work in simulation @@ -194,9 +199,6 @@ private: this->id = deterministicRandom()->randomUniqueID(); //TraceEvent("AsyncFileNonDurable_Create", id).detail("Filename", filename); - this->file = file; - this->filename = filename; - this->diskParameters = diskParameters; maxWriteDelay = FLOW_KNOBS->NON_DURABLE_MAX_WRITE_DELAY; hasBeenSynced = false; @@ -239,7 +241,7 @@ public: } state Reference nonDurableFile( - new AsyncFileNonDurable(filename, file, diskParameters, currentProcess->address, aio)); + new AsyncFileNonDurable(filename, actualFilename, file, diskParameters, currentProcess->address, aio)); // Causes the approximateSize member to be set state Future sizeFuture = nonDurableFile->size(); @@ -269,25 +271,29 @@ public: } void addref() override { ReferenceCounted::addref(); } + void delref() override { if (delref_no_destroy()) { - ASSERT(filesBeingDeleted.count(filename) == 0); - //TraceEvent("AsyncFileNonDurable_StartDelete", id).detail("Filename", filename); - Future deleteFuture = deleteFile(this); - if (!deleteFuture.isReady()) - filesBeingDeleted[filename] = deleteFuture; - } else if (isSoleOwner()) { - // isSoleOwner is a bit confusing here. What we mean is that the openFiles map is the sole owner. If we - // remove the file from the map to make sure it gets closed. 
+ if (filesBeingDeleted.count(filename) == 0) { + //TraceEvent("AsyncFileNonDurable_StartDelete", id).detail("Filename", filename); + Future deleteFuture = deleteFile(this); + if (!deleteFuture.isReady()) + filesBeingDeleted[filename] = deleteFuture; + } + auto& openFiles = g_simulator.getCurrentProcess()->machine->openFiles; auto iter = openFiles.find(filename); - // the file could've been renamed (DiskQueue does that for example). In that case the file won't be in the - // map anymore. + if (iter == openFiles.end()) { + iter = openFiles.find(initialFilename); + } + + // Various actions (e.g. simulated delete) can remove a file from openFiles prematurely, so it may already + // be gone if (iter != openFiles.end()) { // even if the filename exists, it doesn't mean that it references the same file. It could be that the // file was renamed and later a file with the same name was opened. - if (iter->second.canGet() && iter->second.get().getPtr() == this) { - openFiles.erase(filename); + if (iter->second.getPtrIfReady().orDefault(nullptr) == this) { + openFiles.erase(iter); } } } diff --git a/fdbrpc/sim2.actor.cpp b/fdbrpc/sim2.actor.cpp index 6cddbb7e88..f11caa5461 100644 --- a/fdbrpc/sim2.actor.cpp +++ b/fdbrpc/sim2.actor.cpp @@ -1018,8 +1018,8 @@ public: // Get the size of all files we've created on the server and subtract them from the free space for (auto file = proc->machine->openFiles.begin(); file != proc->machine->openFiles.end(); ++file) { - if (file->second.isReady()) { - totalFileSize += ((AsyncFileNonDurable*)file->second.get().getPtr())->approximateSize; + if (file->second.get().isReady()) { + totalFileSize += ((AsyncFileNonDurable*)file->second.get().get().getPtr())->approximateSize; } numFiles++; } @@ -1936,7 +1936,10 @@ public: TraceEvent("ClogInterface") .detail("IP", ip.toString()) .detail("Delay", seconds) - .detail("Queue", mode == ClogSend ? "Send" : mode == ClogReceive ? "Receive" : "All"); + .detail("Queue", + mode == ClogSend ? 
"Send" + : mode == ClogReceive ? "Receive" + : "All"); if (mode == ClogSend || mode == ClogAll) g_clogging.clogSendFor(ip, seconds); @@ -2408,9 +2411,9 @@ int sf_open(const char* filename, int flags, int convFlags, int mode) { GENERIC_READ | ((flags & IAsyncFile::OPEN_READWRITE) ? GENERIC_WRITE : 0), FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, nullptr, - (flags & IAsyncFile::OPEN_EXCLUSIVE) - ? CREATE_NEW - : (flags & IAsyncFile::OPEN_CREATE) ? OPEN_ALWAYS : OPEN_EXISTING, + (flags & IAsyncFile::OPEN_EXCLUSIVE) ? CREATE_NEW + : (flags & IAsyncFile::OPEN_CREATE) ? OPEN_ALWAYS + : OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, nullptr); int h = -1; @@ -2440,7 +2443,7 @@ Future> Sim2FileSystem::open(const std::string& file actualFilename = filename + ".part"; auto partFile = machineCache.find(actualFilename); if (partFile != machineCache.end()) { - Future> f = AsyncFileDetachable::open(partFile->second); + Future> f = AsyncFileDetachable::open(partFile->second.get()); if (FLOW_KNOBS->PAGE_WRITE_CHECKSUM_HISTORY > 0) f = map(f, [=](Reference r) { return Reference(new AsyncFileWriteChecker(r)); @@ -2448,19 +2451,26 @@ Future> Sim2FileSystem::open(const std::string& file return f; } } - if (machineCache.find(actualFilename) == machineCache.end()) { + + Future> f; + auto itr = machineCache.find(actualFilename); + if (itr == machineCache.end()) { // Simulated disk parameters are shared by the AsyncFileNonDurable and the underlying SimpleFile. 
// This way, they can both keep up with the time to start the next operation auto diskParameters = makeReference(FLOW_KNOBS->SIM_DISK_IOPS, FLOW_KNOBS->SIM_DISK_BANDWIDTH); - machineCache[actualFilename] = - AsyncFileNonDurable::open(filename, + f = AsyncFileNonDurable::open(filename, actualFilename, SimpleFile::open(filename, flags, mode, diskParameters, false), diskParameters, (flags & IAsyncFile::OPEN_NO_AIO) == 0); + + machineCache[actualFilename] = WeakFutureReference(f); + } else { + f = itr->second.get(); } - Future> f = AsyncFileDetachable::open(machineCache[actualFilename]); + + f = AsyncFileDetachable::open(f); if (FLOW_KNOBS->PAGE_WRITE_CHECKSUM_HISTORY > 0) f = map(f, [=](Reference r) { return Reference(new AsyncFileWriteChecker(r)); }); return f; diff --git a/fdbrpc/simulator.h b/fdbrpc/simulator.h index 4b74ed91ba..19bed013f2 100644 --- a/fdbrpc/simulator.h +++ b/fdbrpc/simulator.h @@ -188,10 +188,14 @@ public: Promise shutdownSignal; }; + // A set of data associated with a simulated machine struct MachineInfo { ProcessInfo* machineProcess; std::vector processes; - std::map>> openFiles; + + // A map from filename to file handle for all open files on a machine + std::map> openFiles; + std::set deletingFiles; std::set closingFiles; Optional> machineId; diff --git a/fdbserver/SimulatedCluster.actor.cpp b/fdbserver/SimulatedCluster.actor.cpp index 128eace3a8..5b06143ba0 100644 --- a/fdbserver/SimulatedCluster.actor.cpp +++ b/fdbserver/SimulatedCluster.actor.cpp @@ -175,7 +175,6 @@ class TestConfig { ifs.close(); } - public: int extraDB = 0; int minimumReplication = 0; @@ -708,8 +707,8 @@ ACTOR Future simulatedMachine(ClusterConnectionString connStr, // Copy the file pointers to a vector because the map may be modified while we are killing files std::vector files; for (auto fileItr = machineCache.begin(); fileItr != machineCache.end(); ++fileItr) { - ASSERT(fileItr->second.isReady()); - files.push_back((AsyncFileNonDurable*)fileItr->second.get().getPtr()); 
+ ASSERT(fileItr->second.get().isReady()); + files.push_back((AsyncFileNonDurable*)fileItr->second.get().get().getPtr()); } std::vector> killFutures; @@ -725,7 +724,7 @@ ACTOR Future simulatedMachine(ClusterConnectionString connStr, for (auto it : machineCache) { filenames.insert(it.first); closingStr += it.first + ", "; - ASSERT(it.second.isReady() && !it.second.isError()); + ASSERT(it.second.get().canGet()); } for (auto it : g_simulator.getMachineById(localities.machineId())->deletingFiles) { @@ -1240,7 +1239,7 @@ void SimulationConfig::generateNormalConfig(const TestConfig& testConfig) { if (deterministicRandom()->random01() < 0.5) set_config(format("log_spill:=%d", TLogSpillType::DEFAULT)); } - + if (deterministicRandom()->random01() < 0.5) { set_config("backup_worker_enabled:=1"); } diff --git a/flow/genericactors.actor.h b/flow/genericactors.actor.h index 7bf2a05e63..88360685cc 100644 --- a/flow/genericactors.actor.h +++ b/flow/genericactors.actor.h @@ -1899,6 +1899,54 @@ Future operator>>(Future const& lhs, Future const& rhs) { return runAfter(lhs, rhs); } +// A weak reference type to wrap a future Reference object. +// Once the future is complete, this object holds a pointer to the referenced object but does +// not contribute to its reference count. +template +class WeakFutureReference { +public: + WeakFutureReference() {} + WeakFutureReference(Future> future) : data(new WeakFutureReferenceData(future)) {} + + // Returns a future to obtain a normal reference handle + // If the future is ready, this creates a Reference to wrap the object + Future> get() { + if (!data) { + return Reference(); + } else if (data->ptr.present()) { + return Reference::addRef(data->ptr.get()); + } else { + return data->future; + } + } + + // Returns the raw pointer, if the object is ready + // Note: this should be used with care, as this pointer is not counted as a reference to the object and + // it could be deleted if all normal references are destroyed. 
+ Optional getPtrIfReady() { return data->ptr; } + +private: + // A class to hold the state for a WeakFutureReference + struct WeakFutureReferenceData : public ReferenceCounted, NonCopyable { + Optional ptr; + Future> future; + Future moveResultFuture; + + WeakFutureReferenceData(Future> future) : future(future) { moveResultFuture = moveResult(this); } + + // Waits for the future to complete and then stores the pointer in local storage + // When this completes, we will no longer be counted toward the reference count of the object + ACTOR Future moveResult(WeakFutureReferenceData* self) { + Reference result = wait(self->future); + self->ptr = result.getPtr(); + self->future = Future>(); + return Void(); + } + }; + + Reference data; +}; + #include "flow/unactorcompiler.h" #endif From 944a03d57589f1abbe1641d74f1462e17725eb5a Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Wed, 26 May 2021 16:26:45 -0700 Subject: [PATCH 446/461] For files that use the atomic write and create mechanism, attempt to remove the file from the openFiles map at both its old and new name --- fdbrpc/AsyncFileNonDurable.actor.h | 31 ++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/fdbrpc/AsyncFileNonDurable.actor.h b/fdbrpc/AsyncFileNonDurable.actor.h index ccc2ad42b4..bde8e0fe9e 100644 --- a/fdbrpc/AsyncFileNonDurable.actor.h +++ b/fdbrpc/AsyncFileNonDurable.actor.h @@ -281,20 +281,27 @@ public: filesBeingDeleted[filename] = deleteFuture; } - auto& openFiles = g_simulator.getCurrentProcess()->machine->openFiles; - auto iter = openFiles.find(filename); - if (iter == openFiles.end()) { - iter = openFiles.find(initialFilename); + removeOpenFile(filename, this); + if (initialFilename != filename) { + removeOpenFile(initialFilename, this); } + } + } - // Various actions (e.g. 
simulated delete) can remove a file from openFiles prematurely, so it may already - // be gone - if (iter != openFiles.end()) { - // even if the filename exists, it doesn't mean that it references the same file. It could be that the - // file was renamed and later a file with the same name was opened. - if (iter->second.getPtrIfReady().orDefault(nullptr) == this) { - openFiles.erase(iter); - } + // Removes a file from the openFiles map + static void removeOpenFile(std::string filename, AsyncFileNonDurable* file) { + auto& openFiles = g_simulator.getCurrentProcess()->machine->openFiles; + + auto iter = openFiles.find(filename); + + // Various actions (e.g. simulated delete) can remove a file from openFiles prematurely, so it may already + // be gone. Renamed files (from atomic write and create) will also be present under only one of the two + // names. + if (iter != openFiles.end()) { + // even if the filename exists, it doesn't mean that it references the same file. It could be that the + // file was renamed and later a file with the same name was opened. 
+ if (iter->second.getPtrIfReady().orDefault(nullptr) == file) { + openFiles.erase(iter); } } } From 065c4fdd5a039aa561f73d585426ce23000e7da8 Mon Sep 17 00:00:00 2001 From: Dan Lambright Date: Tue, 30 Mar 2021 12:31:10 -0400 Subject: [PATCH 447/461] issue 4252 --- fdbserver/Coordination.actor.cpp | 78 +++++++++++++++++++++++++--- fdbserver/CoordinationInterface.h | 2 +- fdbserver/Knobs.cpp | 2 + fdbserver/Knobs.h | 3 ++ fdbserver/SimulatedCluster.actor.cpp | 15 ++++-- fdbserver/worker.actor.cpp | 2 +- flow/error_definitions.h | 1 + 7 files changed, 90 insertions(+), 13 deletions(-) diff --git a/fdbserver/Coordination.actor.cpp b/fdbserver/Coordination.actor.cpp index 92de5a5b3c..8443e849eb 100644 --- a/fdbserver/Coordination.actor.cpp +++ b/fdbserver/Coordination.actor.cpp @@ -545,9 +545,15 @@ struct LeaderRegisterCollection { } }; +StringRef getClusterName(Key key) { + StringRef str = key.contents(); + return str.eat(":"); +} + // leaderServer multiplexes multiple leaderRegisters onto a single LeaderElectionRegInterface, // creating and destroying them on demand. 
-ACTOR Future leaderServer(LeaderElectionRegInterface interf, OnDemandStore* pStore, UID id) { +ACTOR Future leaderServer(LeaderElectionRegInterface interf, OnDemandStore* pStore, UID id, + Reference ccf) { state LeaderRegisterCollection regs(pStore); state ActorCollection forwarders(false); @@ -562,6 +568,16 @@ ACTOR Future leaderServer(LeaderElectionRegInterface interf, OnDemandStore info.forward = forward.get().serializedInfo; req.reply.send(CachedSerialization(info)); } else { + StringRef reqClusterName = getClusterName(req.clusterKey); + StringRef clusterName = getClusterName(ccf->getConnectionString().clusterKey()); + if (reqClusterName.compare(clusterName) || + ccf->getConnectionString().coordinators() != req.coordinators) { + TraceEvent(SevWarnAlways, "CCFMismatch") + .detail("RequestType", "OpenDatabaseCoordRequest") + .detail("LocalCS", ccf->getConnectionString().toString()) + .detail("IncomingClusterKey", req.clusterKey) + .detail("IncomingCoordinators", describeList(req.coordinators, req.coordinators.size())); + } regs.getInterface(req.clusterKey, id).openDatabase.send(req); } } @@ -570,6 +586,16 @@ ACTOR Future leaderServer(LeaderElectionRegInterface interf, OnDemandStore if (forward.present()) { req.reply.send(forward.get()); } else { + StringRef reqClusterName = getClusterName(req.key); + StringRef clusterName = getClusterName(ccf->getConnectionString().clusterKey()); + if (reqClusterName.compare(clusterName) || + ccf->getConnectionString().coordinators() != req.coordinators) { + TraceEvent(SevWarnAlways, "CCFMismatch") + .detail("RequestType", "ElectionResultRequest") + .detail("LocalCS", ccf->getConnectionString().toString()) + .detail("IncomingClusterKey", req.key) + .detail("IncomingCoordinators", describeList(req.coordinators, req.coordinators.size())); + } regs.getInterface(req.key, id).electionResult.send(req); } } @@ -577,30 +603,66 @@ ACTOR Future leaderServer(LeaderElectionRegInterface interf, OnDemandStore Optional forward = 
regs.getForward(req.key); if (forward.present()) req.reply.send(forward.get()); - else + else { + StringRef reqClusterName = getClusterName(req.key); + StringRef clusterName = getClusterName(ccf->getConnectionString().clusterKey()); + if (reqClusterName.compare(clusterName)) { + TraceEvent(SevWarnAlways, "CCFMismatch") + .detail("RequestType", "GetLeaderRequest") + .detail("LocalCS", ccf->getConnectionString().toString()) + .detail("IncomingClusterKey", req.key) + .detail("Key", reqClusterName).detail("Key2",clusterName); + } regs.getInterface(req.key, id).getLeader.send(req); + } } when(CandidacyRequest req = waitNext(interf.candidacy.getFuture())) { Optional forward = regs.getForward(req.key); if (forward.present()) req.reply.send(forward.get()); - else + else { + StringRef reqClusterName = getClusterName(req.key); + StringRef clusterName = getClusterName(ccf->getConnectionString().clusterKey()); + if (reqClusterName.compare(clusterName)) { + TraceEvent(SevWarnAlways, "CCFMismatch") + .detail("RequestType", "CandidacyRequest") + .detail("LocalCS", ccf->getConnectionString().toString()) + .detail("IncomingClusterKey", req.key); + } regs.getInterface(req.key, id).candidacy.send(req); + } } when(LeaderHeartbeatRequest req = waitNext(interf.leaderHeartbeat.getFuture())) { Optional forward = regs.getForward(req.key); if (forward.present()) req.reply.send(LeaderHeartbeatReply{ false }); - else + else { + StringRef reqClusterName = getClusterName(req.key); + StringRef clusterName = getClusterName(ccf->getConnectionString().clusterKey()); + if (reqClusterName.compare(clusterName)) { + TraceEvent(SevWarnAlways, "CCFMismatch") + .detail("RequestType", "LeaderHeartbeatRequest") + .detail("LocalCS", ccf->getConnectionString().toString()) + .detail("IncomingClusterKey", req.key); + } regs.getInterface(req.key, id).leaderHeartbeat.send(req); + } } when(ForwardRequest req = waitNext(interf.forward.getFuture())) { Optional forward = regs.getForward(req.key); if 
(forward.present()) req.reply.send(Void()); else { - forwarders.add( - LeaderRegisterCollection::setForward(®s, req.key, ClusterConnectionString(req.conn.toString()))); + StringRef reqClusterName = getClusterName(req.key); + StringRef clusterName = getClusterName(ccf->getConnectionString().clusterKey()); + if (reqClusterName.compare(clusterName)) { + TraceEvent(SevWarnAlways, "CCFMismatch") + .detail("RequestType", "ForwardRequest") + .detail("LocalCS", ccf->getConnectionString().toString()) + .detail("IncomingClusterKey", req.key); + } + forwarders.add(LeaderRegisterCollection::setForward(®s, req.key, + ClusterConnectionString(req.conn.toString()))); regs.getInterface(req.key, id).forward.send(req); } } @@ -611,7 +673,7 @@ ACTOR Future leaderServer(LeaderElectionRegInterface interf, OnDemandStore } } -ACTOR Future coordinationServer(std::string dataFolder) { +ACTOR Future coordinationServer(std::string dataFolder, Reference ccf) { state UID myID = deterministicRandom()->randomUniqueID(); state LeaderElectionRegInterface myLeaderInterface(g_network); state GenerationRegInterface myInterface(g_network); @@ -622,7 +684,7 @@ ACTOR Future coordinationServer(std::string dataFolder) { .detail("Folder", dataFolder); try { - wait(localGenerationReg(myInterface, &store) || leaderServer(myLeaderInterface, &store, myID) || + wait(localGenerationReg(myInterface, &store) || leaderServer(myLeaderInterface, &store, myID, ccf) || store.getError()); throw internal_error(); } catch (Error& e) { diff --git a/fdbserver/CoordinationInterface.h b/fdbserver/CoordinationInterface.h index 5e824ee0ee..ea379d1358 100644 --- a/fdbserver/CoordinationInterface.h +++ b/fdbserver/CoordinationInterface.h @@ -225,6 +225,6 @@ public: vector stateServers; }; -Future coordinationServer(std::string const& dataFolder); +Future coordinationServer(std::string const& dataFolder, Reference const& ccf); #endif diff --git a/fdbserver/Knobs.cpp b/fdbserver/Knobs.cpp index fc1234d243..d3e32203d9 100644 --- 
a/fdbserver/Knobs.cpp +++ b/fdbserver/Knobs.cpp @@ -631,6 +631,8 @@ void ServerKnobs::initialize(bool randomize, ClientKnobs* clientKnobs, bool isSi // Coordination init( COORDINATED_STATE_ONCONFLICT_POLL_INTERVAL, 1.0 ); if( randomize && BUGGIFY ) COORDINATED_STATE_ONCONFLICT_POLL_INTERVAL = 10.0; + init( FORWARD_REQUEST_TOO_OLD, 600.0 ); if( randomize && BUGGIFY ) FORWARD_REQUEST_TOO_OLD = 60.0; + init( ENABLE_CROSS_CLUSTER_SUPPORT, true ); if( randomize && BUGGIFY ) ENABLE_CROSS_CLUSTER_SUPPORT = false; // Buggification init( BUGGIFIED_EVENTUAL_CONSISTENCY, 1.0 ); diff --git a/fdbserver/Knobs.h b/fdbserver/Knobs.h index be2caba6a1..6bc56d4457 100644 --- a/fdbserver/Knobs.h +++ b/fdbserver/Knobs.h @@ -559,6 +559,9 @@ public: // Coordination double COORDINATED_STATE_ONCONFLICT_POLL_INTERVAL; + double FORWARD_REQUEST_TOO_OLD; + bool ENABLE_CROSS_CLUSTER_SUPPORT; // Allow a coordinator to serve requests whose connection string does not match + // the local copy // Buggification double BUGGIFIED_EVENTUAL_CONSISTENCY; diff --git a/fdbserver/SimulatedCluster.actor.cpp b/fdbserver/SimulatedCluster.actor.cpp index 128eace3a8..72d810961d 100644 --- a/fdbserver/SimulatedCluster.actor.cpp +++ b/fdbserver/SimulatedCluster.actor.cpp @@ -1605,6 +1605,7 @@ void setupSimulatedSystem(vector>* systemActors, TEST(!useIPv6); // Use IPv4 vector coordinatorAddresses; + vector extraCoordinatorAddresses; // Used by extra DB if the DR db is a new one if (testConfig.minimumRegions > 1) { // do not put coordinators in the primary region so that we can kill that region safely int nonPrimaryDcs = dataCenters / 2; @@ -1614,6 +1615,9 @@ void setupSimulatedSystem(vector>* systemActors, auto ip = makeIPAddressForSim(useIPv6, { 2, dc, 1, m }); coordinatorAddresses.push_back( NetworkAddress(ip, sslEnabled && !sslOnly ? 
2 : 1, true, sslEnabled && sslOnly)); + auto extraIp = makeIPAddressForSim(useIPv6, { 4, dc, 1, m }); + extraCoordinatorAddresses.push_back( + NetworkAddress(extraIp, sslEnabled && !sslOnly ? 2 : 1, true, sslEnabled && sslOnly)); TraceEvent("SelectedCoordinator").detail("Address", coordinatorAddresses.back()); } } @@ -1642,6 +1646,9 @@ void setupSimulatedSystem(vector>* systemActors, auto ip = makeIPAddressForSim(useIPv6, { 2, dc, 1, m }); coordinatorAddresses.push_back( NetworkAddress(ip, sslEnabled && !sslOnly ? 2 : 1, true, sslEnabled && sslOnly)); + auto extraIp = makeIPAddressForSim(useIPv6, { 4, dc, 1, m }); + extraCoordinatorAddresses.push_back( + NetworkAddress(extraIp, sslEnabled && !sslOnly ? 2 : 1, true, sslEnabled && sslOnly)); TraceEvent("SelectedCoordinator") .detail("Address", coordinatorAddresses.back()) .detail("M", m) @@ -1678,11 +1685,13 @@ void setupSimulatedSystem(vector>* systemActors, // If extraDB==0, leave g_simulator.extraDB as null because the test does not use DR. if (testConfig.extraDB == 1) { // The DR database can be either a new database or itself - g_simulator.extraDB = new ClusterConnectionString( - coordinatorAddresses, BUGGIFY ? LiteralStringRef("TestCluster:0") : LiteralStringRef("ExtraCluster:0")); + g_simulator.extraDB = + BUGGIFY ? 
new ClusterConnectionString(coordinatorAddresses, LiteralStringRef("TestCluster:0")) + : new ClusterConnectionString(extraCoordinatorAddresses, LiteralStringRef("ExtraCluster:0")); } else if (testConfig.extraDB == 2) { // The DR database is a new database - g_simulator.extraDB = new ClusterConnectionString(coordinatorAddresses, LiteralStringRef("ExtraCluster:0")); + g_simulator.extraDB = + new ClusterConnectionString(extraCoordinatorAddresses, LiteralStringRef("ExtraCluster:0")); } else if (testConfig.extraDB == 3) { // The DR database is the same database g_simulator.extraDB = new ClusterConnectionString(coordinatorAddresses, LiteralStringRef("TestCluster:0")); diff --git a/fdbserver/worker.actor.cpp b/fdbserver/worker.actor.cpp index 5721b154d4..db5c09e0ed 100644 --- a/fdbserver/worker.actor.cpp +++ b/fdbserver/worker.actor.cpp @@ -2047,7 +2047,7 @@ ACTOR Future fdbd(Reference connFile, if (coordFolder.size()) { // SOMEDAY: remove the fileNotFound wrapper and make DiskQueue construction safe from errors setting up // their files - actors.push_back(fileNotFoundToNever(coordinationServer(coordFolder))); + actors.push_back(fileNotFoundToNever(coordinationServer(coordFolder, coordinators.ccf))); } state UID processIDUid = wait(createAndLockProcessIdFile(dataFolder)); diff --git a/flow/error_definitions.h b/flow/error_definitions.h index 70f8750836..4af3aee275 100755 --- a/flow/error_definitions.h +++ b/flow/error_definitions.h @@ -74,6 +74,7 @@ ERROR( disk_adapter_reset, 1050, "The disk queue adpater reset" ) ERROR( batch_transaction_throttled, 1051, "Batch GRV request rate limit exceeded") ERROR( dd_cancelled, 1052, "Data distribution components cancelled") ERROR( dd_not_found, 1053, "Data distributor not found") +ERROR( wrong_connection_file, 1054, "Connection file mismatch") ERROR( broken_promise, 1100, "Broken promise" ) ERROR( operation_cancelled, 1101, "Asynchronous operation cancelled" ) From 60d27d05d8edccaa77c57a04bf2d68b19da4447b Mon Sep 17 00:00:00 2001 
From: Dan Lambright Date: Mon, 5 Apr 2021 15:52:48 -0400 Subject: [PATCH 448/461] add knob enabling cross cluster support (default true) --- fdbserver/Coordination.actor.cpp | 81 +++++++++++-------- fdbserver/Knobs.cpp | 1 - fdbserver/Knobs.h | 4 +- fdbserver/SimulatedCluster.actor.cpp | 2 +- .../workloads/ConfigureDatabase.actor.cpp | 8 +- .../SpecialKeySpaceCorrectness.actor.cpp | 3 +- 6 files changed, 57 insertions(+), 42 deletions(-) diff --git a/fdbserver/Coordination.actor.cpp b/fdbserver/Coordination.actor.cpp index 8443e849eb..b4d5f9f38a 100644 --- a/fdbserver/Coordination.actor.cpp +++ b/fdbserver/Coordination.actor.cpp @@ -568,17 +568,18 @@ ACTOR Future leaderServer(LeaderElectionRegInterface interf, OnDemandStore info.forward = forward.get().serializedInfo; req.reply.send(CachedSerialization(info)); } else { - StringRef reqClusterName = getClusterName(req.clusterKey); - StringRef clusterName = getClusterName(ccf->getConnectionString().clusterKey()); - if (reqClusterName.compare(clusterName) || - ccf->getConnectionString().coordinators() != req.coordinators) { - TraceEvent(SevWarnAlways, "CCFMismatch") + StringRef clusterName = ccf->getConnectionString().clusterKeyName(); + if (! 
SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT && + getClusterName(req.clusterKey).compare(clusterName)) { + TraceEvent(SevError, "CCFMismatch") .detail("RequestType", "OpenDatabaseCoordRequest") .detail("LocalCS", ccf->getConnectionString().toString()) .detail("IncomingClusterKey", req.clusterKey) .detail("IncomingCoordinators", describeList(req.coordinators, req.coordinators.size())); + req.reply.sendError(wrong_connection_file()); + } else { + regs.getInterface(req.clusterKey, id).openDatabase.send(req); } - regs.getInterface(req.clusterKey, id).openDatabase.send(req); } } when(ElectionResultRequest req = waitNext(interf.electionResult.getFuture())) { @@ -586,17 +587,19 @@ ACTOR Future leaderServer(LeaderElectionRegInterface interf, OnDemandStore if (forward.present()) { req.reply.send(forward.get()); } else { - StringRef reqClusterName = getClusterName(req.key); - StringRef clusterName = getClusterName(ccf->getConnectionString().clusterKey()); - if (reqClusterName.compare(clusterName) || - ccf->getConnectionString().coordinators() != req.coordinators) { - TraceEvent(SevWarnAlways, "CCFMismatch") + StringRef clusterName = ccf->getConnectionString().clusterKeyName(); + if (! 
SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT && + getClusterName(req.key).compare(clusterName)) { + TraceEvent(SevError, "CCFMismatch") .detail("RequestType", "ElectionResultRequest") .detail("LocalCS", ccf->getConnectionString().toString()) .detail("IncomingClusterKey", req.key) + .detail("ClusterKey", ccf->getConnectionString().clusterKey()) .detail("IncomingCoordinators", describeList(req.coordinators, req.coordinators.size())); + req.reply.sendError(wrong_connection_file()); + } else { + regs.getInterface(req.key, id).electionResult.send(req); } - regs.getInterface(req.key, id).electionResult.send(req); } } when(GetLeaderRequest req = waitNext(interf.getLeader.getFuture())) { @@ -604,16 +607,18 @@ ACTOR Future leaderServer(LeaderElectionRegInterface interf, OnDemandStore if (forward.present()) req.reply.send(forward.get()); else { - StringRef reqClusterName = getClusterName(req.key); - StringRef clusterName = getClusterName(ccf->getConnectionString().clusterKey()); - if (reqClusterName.compare(clusterName)) { - TraceEvent(SevWarnAlways, "CCFMismatch") + StringRef clusterName = ccf->getConnectionString().clusterKeyName(); + if (! 
SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT && + getClusterName(req.key).compare(clusterName)) { + TraceEvent(SevError, "CCFMismatch") .detail("RequestType", "GetLeaderRequest") .detail("LocalCS", ccf->getConnectionString().toString()) .detail("IncomingClusterKey", req.key) - .detail("Key", reqClusterName).detail("Key2",clusterName); + .detail("ClusterKey", ccf->getConnectionString().clusterKey()); + req.reply.sendError(wrong_connection_file()); + } else { + regs.getInterface(req.key, id).getLeader.send(req); } - regs.getInterface(req.key, id).getLeader.send(req); } } when(CandidacyRequest req = waitNext(interf.candidacy.getFuture())) { @@ -621,15 +626,17 @@ ACTOR Future leaderServer(LeaderElectionRegInterface interf, OnDemandStore if (forward.present()) req.reply.send(forward.get()); else { - StringRef reqClusterName = getClusterName(req.key); - StringRef clusterName = getClusterName(ccf->getConnectionString().clusterKey()); - if (reqClusterName.compare(clusterName)) { - TraceEvent(SevWarnAlways, "CCFMismatch") + StringRef clusterName = ccf->getConnectionString().clusterKeyName(); + if (! 
SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT && + getClusterName(req.key).compare(clusterName)) { + TraceEvent(SevError, "CCFMismatch") .detail("RequestType", "CandidacyRequest") .detail("LocalCS", ccf->getConnectionString().toString()) .detail("IncomingClusterKey", req.key); + req.reply.sendError(wrong_connection_file()); + } else { + regs.getInterface(req.key, id).candidacy.send(req); } - regs.getInterface(req.key, id).candidacy.send(req); } } when(LeaderHeartbeatRequest req = waitNext(interf.leaderHeartbeat.getFuture())) { @@ -637,15 +644,17 @@ ACTOR Future leaderServer(LeaderElectionRegInterface interf, OnDemandStore if (forward.present()) req.reply.send(LeaderHeartbeatReply{ false }); else { - StringRef reqClusterName = getClusterName(req.key); - StringRef clusterName = getClusterName(ccf->getConnectionString().clusterKey()); - if (reqClusterName.compare(clusterName)) { - TraceEvent(SevWarnAlways, "CCFMismatch") + StringRef clusterName = ccf->getConnectionString().clusterKeyName(); + if (! SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT && + getClusterName(req.key).compare(clusterName)) { + TraceEvent(SevError, "CCFMismatch") .detail("RequestType", "LeaderHeartbeatRequest") .detail("LocalCS", ccf->getConnectionString().toString()) .detail("IncomingClusterKey", req.key); + req.reply.sendError(wrong_connection_file()); + } else { + regs.getInterface(req.key, id).leaderHeartbeat.send(req); } - regs.getInterface(req.key, id).leaderHeartbeat.send(req); } } when(ForwardRequest req = waitNext(interf.forward.getFuture())) { @@ -653,17 +662,19 @@ ACTOR Future leaderServer(LeaderElectionRegInterface interf, OnDemandStore if (forward.present()) req.reply.send(Void()); else { - StringRef reqClusterName = getClusterName(req.key); - StringRef clusterName = getClusterName(ccf->getConnectionString().clusterKey()); - if (reqClusterName.compare(clusterName)) { - TraceEvent(SevWarnAlways, "CCFMismatch") + StringRef clusterName = ccf->getConnectionString().clusterKeyName(); + if (! 
SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT && + getClusterName(req.key).compare(clusterName)) { + TraceEvent(SevError, "CCFMismatch") .detail("RequestType", "ForwardRequest") .detail("LocalCS", ccf->getConnectionString().toString()) .detail("IncomingClusterKey", req.key); + req.reply.sendError(wrong_connection_file()); + } else { + forwarders.add( + LeaderRegisterCollection::setForward(®s, req.key, ClusterConnectionString(req.conn.toString()))); + regs.getInterface(req.key, id).forward.send(req); } - forwarders.add(LeaderRegisterCollection::setForward(®s, req.key, - ClusterConnectionString(req.conn.toString()))); - regs.getInterface(req.key, id).forward.send(req); } } when(wait(forwarders.getResult())) { diff --git a/fdbserver/Knobs.cpp b/fdbserver/Knobs.cpp index d3e32203d9..b002204b0b 100644 --- a/fdbserver/Knobs.cpp +++ b/fdbserver/Knobs.cpp @@ -631,7 +631,6 @@ void ServerKnobs::initialize(bool randomize, ClientKnobs* clientKnobs, bool isSi // Coordination init( COORDINATED_STATE_ONCONFLICT_POLL_INTERVAL, 1.0 ); if( randomize && BUGGIFY ) COORDINATED_STATE_ONCONFLICT_POLL_INTERVAL = 10.0; - init( FORWARD_REQUEST_TOO_OLD, 600.0 ); if( randomize && BUGGIFY ) FORWARD_REQUEST_TOO_OLD = 60.0; init( ENABLE_CROSS_CLUSTER_SUPPORT, true ); if( randomize && BUGGIFY ) ENABLE_CROSS_CLUSTER_SUPPORT = false; // Buggification diff --git a/fdbserver/Knobs.h b/fdbserver/Knobs.h index 6bc56d4457..3426f6bb18 100644 --- a/fdbserver/Knobs.h +++ b/fdbserver/Knobs.h @@ -559,9 +559,7 @@ public: // Coordination double COORDINATED_STATE_ONCONFLICT_POLL_INTERVAL; - double FORWARD_REQUEST_TOO_OLD; - bool ENABLE_CROSS_CLUSTER_SUPPORT; // Allow a coordinator to serve requests whose connection string does not match - // the local copy + bool ENABLE_CROSS_CLUSTER_SUPPORT; // Allow a coordinator to serve requests whose connection string does not match the local descriptor // Buggification double BUGGIFIED_EVENTUAL_CONSISTENCY; diff --git a/fdbserver/SimulatedCluster.actor.cpp 
b/fdbserver/SimulatedCluster.actor.cpp index 72d810961d..bfa1f9d007 100644 --- a/fdbserver/SimulatedCluster.actor.cpp +++ b/fdbserver/SimulatedCluster.actor.cpp @@ -1691,7 +1691,7 @@ void setupSimulatedSystem(vector>* systemActors, } else if (testConfig.extraDB == 2) { // The DR database is a new database g_simulator.extraDB = - new ClusterConnectionString(extraCoordinatorAddresses, LiteralStringRef("ExtraCluster:0")); + new ClusterConnectionString(extraCoordinatorAddresses, LiteralStringRef("ExtraCluster:0")); } else if (testConfig.extraDB == 3) { // The DR database is the same database g_simulator.extraDB = new ClusterConnectionString(coordinatorAddresses, LiteralStringRef("TestCluster:0")); diff --git a/fdbserver/workloads/ConfigureDatabase.actor.cpp b/fdbserver/workloads/ConfigureDatabase.actor.cpp index d9193fc9d9..0ab7d1b88b 100644 --- a/fdbserver/workloads/ConfigureDatabase.actor.cpp +++ b/fdbserver/workloads/ConfigureDatabase.actor.cpp @@ -270,6 +270,7 @@ struct ConfigureDatabaseWorkload : TestWorkload { return Void(); } state int randomChoice = deterministicRandom()->randomInt(0, 8); + if (randomChoice == 0) { wait(success( runRYWTransaction(cx, [=](Reference tr) -> Future> { @@ -316,8 +317,13 @@ struct ConfigureDatabaseWorkload : TestWorkload { } else if (randomChoice == 4) { //TraceEvent("ConfigureTestQuorumBegin").detail("NewQuorum", s); auto ch = autoQuorumChange(); + std::string desiredClusterName = "NewName%d"; + if (! 
SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT) { + // if configuration does not allow changing the descriptor, pass empty string (keep old descriptor) + desiredClusterName = ""; + } if (deterministicRandom()->randomInt(0, 2)) - ch = nameQuorumChange(format("NewName%d", deterministicRandom()->randomInt(0, 100)), ch); + ch = nameQuorumChange(format(desiredClusterName.c_str(), deterministicRandom()->randomInt(0, 100)), ch); wait(success(changeQuorum(cx, ch))); //TraceEvent("ConfigureTestConfigureEnd").detail("NewQuorum", s); } else if (randomChoice == 5) { diff --git a/fdbserver/workloads/SpecialKeySpaceCorrectness.actor.cpp b/fdbserver/workloads/SpecialKeySpaceCorrectness.actor.cpp index e6a6650de3..5a38e20d7e 100644 --- a/fdbserver/workloads/SpecialKeySpaceCorrectness.actor.cpp +++ b/fdbserver/workloads/SpecialKeySpaceCorrectness.actor.cpp @@ -936,7 +936,8 @@ struct SpecialKeySpaceCorrectnessWorkload : TestWorkload { // test change coordinators and cluster description // we randomly pick one process(not coordinator) and add it, in this case, it should always succeed { - state std::string new_cluster_description = deterministicRandom()->randomAlphaNumeric(8); + // choose a new description if configuration allows transactions across differently named clusters + state std::string new_cluster_description = SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT ? 
deterministicRandom()->randomAlphaNumeric(8) : cs.clusterKeyName().toString(); state std::string new_coordinator_process; state std::vector old_coordinators_processes; state bool possible_to_add_coordinator; From 742c22cef2eebd74134e58e05a68bf9b4e736678 Mon Sep 17 00:00:00 2001 From: Dan Lambright Date: Thu, 22 Apr 2021 13:01:21 -0400 Subject: [PATCH 449/461] Don't allow changing desriptor if knob is set --- fdbclient/CoordinationInterface.h | 28 +++++++++++++++++++++++++++- fdbclient/ManagementAPI.actor.cpp | 13 ++++++++++--- fdbclient/MonitorLeader.actor.cpp | 4 +++- fdbrpc/FlowTransport.actor.cpp | 4 +++- fdbserver/Coordination.actor.cpp | 4 ++++ fdbserver/CoordinationInterface.h | 12 ++++++------ 6 files changed, 53 insertions(+), 12 deletions(-) diff --git a/fdbclient/CoordinationInterface.h b/fdbclient/CoordinationInterface.h index a852df7a94..2ebd4e1259 100644 --- a/fdbclient/CoordinationInterface.h +++ b/fdbclient/CoordinationInterface.h @@ -32,13 +32,15 @@ const int MAX_CLUSTER_FILE_BYTES = 60000; constexpr UID WLTOKEN_CLIENTLEADERREG_GETLEADER(-1, 2); constexpr UID WLTOKEN_CLIENTLEADERREG_OPENDATABASE(-1, 3); +constexpr UID WLTOKEN_CLIENTLEADERREG_DESCRIPTOR_MUTABLE(-1, 4); -constexpr UID WLTOKEN_PROTOCOL_INFO(-1, 10); +constexpr UID WLTOKEN_PROTOCOL_INFO(-1, 11); // The coordinator interface as exposed to clients struct ClientLeaderRegInterface { RequestStream getLeader; RequestStream openDatabase; + RequestStream checkClusterNameMutability; ClientLeaderRegInterface() {} ClientLeaderRegInterface(NetworkAddress remote); @@ -236,4 +238,28 @@ struct ProtocolInfoRequest { } }; +struct CheckClusterNameMutabilityReply { + constexpr static FileIdentifier file_identifier = 7784299; + CheckClusterNameMutabilityReply() = default; + explicit CheckClusterNameMutabilityReply(bool value) : value(value) {} + bool value; + template + void serialize(Ar& ar) { + serializer(ar, value); + } +}; + +struct CheckClusterNameMutability { + constexpr static FileIdentifier 
file_identifier = 214729; + Key key; + ReplyPromise reply; + explicit CheckClusterNameMutability(Key key) : key(key) {} + CheckClusterNameMutability(){} + + template + void serialize(Ar& ar) { + serializer(ar, key, reply); + } +}; + #endif diff --git a/fdbclient/ManagementAPI.actor.cpp b/fdbclient/ManagementAPI.actor.cpp index 90d670e801..56d9f0e6ec 100644 --- a/fdbclient/ManagementAPI.actor.cpp +++ b/fdbclient/ManagementAPI.actor.cpp @@ -1105,6 +1105,7 @@ ACTOR Future> changeQuorumChecker(Transaction* tr, vector>> leaderServers; ClientCoordinators coord(Reference(new ClusterConnectionFile(conn))); + leaderServers.reserve(coord.clientLeaderServers.size()); for (int i = 0; i < coord.clientLeaderServers.size(); i++) leaderServers.push_back(retryBrokenPromise(coord.clientLeaderServers[i].getLeader, @@ -1188,14 +1189,20 @@ ACTOR Future changeQuorum(Database cx, Reference>> leaderServers; - ClientCoordinators coord(Reference(new ClusterConnectionFile(conn))); + state vector>> leaderServers; + state ClientCoordinators coord(Reference(new ClusterConnectionFile(conn))); + if (! change->getDesiredClusterKeyName().empty()) { + CheckClusterNameMutabilityReply mutabilityReply = wait(coord.clientLeaderServers[0].checkClusterNameMutability.getReply( + CheckClusterNameMutability())); + if (! 
mutabilityReply.value) { + return CoordinatorsResult::BAD_DATABASE_STATE; + } + } leaderServers.reserve(coord.clientLeaderServers.size()); for (int i = 0; i < coord.clientLeaderServers.size(); i++) leaderServers.push_back(retryBrokenPromise(coord.clientLeaderServers[i].getLeader, GetLeaderRequest(coord.clusterKey, UID()), TaskPriority::CoordinationReply)); - choose { when(wait(waitForAll(leaderServers))) {} when(wait(delay(5.0))) { return CoordinatorsResult::COORDINATOR_UNREACHABLE; } diff --git a/fdbclient/MonitorLeader.actor.cpp b/fdbclient/MonitorLeader.actor.cpp index 057e546501..a4dfe5a4a1 100644 --- a/fdbclient/MonitorLeader.actor.cpp +++ b/fdbclient/MonitorLeader.actor.cpp @@ -380,11 +380,13 @@ ClientCoordinators::ClientCoordinators(Key clusterKey, std::vector pingLatencyLogger(TransportData* self) { } TransportData::TransportData(uint64_t transportId) - : endpoints(/*wellKnownTokenCount*/ 11), endpointNotFoundReceiver(endpoints), pingReceiver(endpoints), + : endpoints(/*wellKnownTokenCount*/ WLTOKEN_COUNTS), endpointNotFoundReceiver(endpoints), pingReceiver(endpoints), warnAlwaysForLargePacket(true), lastIncompatibleMessage(0), transportId(transportId), numIncompatibleConnections(0) { degraded = makeReference>(false); diff --git a/fdbserver/Coordination.actor.cpp b/fdbserver/Coordination.actor.cpp index b4d5f9f38a..974062a056 100644 --- a/fdbserver/Coordination.actor.cpp +++ b/fdbserver/Coordination.actor.cpp @@ -560,6 +560,10 @@ ACTOR Future leaderServer(LeaderElectionRegInterface interf, OnDemandStore wait(LeaderRegisterCollection::init(®s)); loop choose { + when(CheckClusterNameMutability req = waitNext(interf.checkClusterNameMutability.getFuture())) { + CheckClusterNameMutabilityReply rep(SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT ? 
true : false); + req.reply.send(rep); + } when(OpenDatabaseCoordRequest req = waitNext(interf.openDatabase.getFuture())) { Optional forward = regs.getForward(req.clusterKey); if (forward.present()) { diff --git a/fdbserver/CoordinationInterface.h b/fdbserver/CoordinationInterface.h index ea379d1358..9cf4cb3ea0 100644 --- a/fdbserver/CoordinationInterface.h +++ b/fdbserver/CoordinationInterface.h @@ -24,12 +24,12 @@ #include "fdbclient/CoordinationInterface.h" -constexpr UID WLTOKEN_LEADERELECTIONREG_CANDIDACY(-1, 4); -constexpr UID WLTOKEN_LEADERELECTIONREG_ELECTIONRESULT(-1, 5); -constexpr UID WLTOKEN_LEADERELECTIONREG_LEADERHEARTBEAT(-1, 6); -constexpr UID WLTOKEN_LEADERELECTIONREG_FORWARD(-1, 7); -constexpr UID WLTOKEN_GENERATIONREG_READ(-1, 8); -constexpr UID WLTOKEN_GENERATIONREG_WRITE(-1, 9); +constexpr UID WLTOKEN_LEADERELECTIONREG_CANDIDACY(-1, 5); +constexpr UID WLTOKEN_LEADERELECTIONREG_ELECTIONRESULT(-1, 6); +constexpr UID WLTOKEN_LEADERELECTIONREG_LEADERHEARTBEAT(-1, 7); +constexpr UID WLTOKEN_LEADERELECTIONREG_FORWARD(-1, 8); +constexpr UID WLTOKEN_GENERATIONREG_READ(-1, 9); +constexpr UID WLTOKEN_GENERATIONREG_WRITE(-1, 10); struct GenerationRegInterface { constexpr static FileIdentifier file_identifier = 16726744; From fcfb78162c74cededb6d96f28ee2844dd579af8f Mon Sep 17 00:00:00 2001 From: Dan Lambright Date: Fri, 23 Apr 2021 09:19:48 -0400 Subject: [PATCH 450/461] misc cleanup for publishing --- fdbclient/CoordinationInterface.h | 20 +++++++++++--------- fdbclient/ManagementAPI.actor.cpp | 8 ++++---- fdbclient/MonitorLeader.actor.cpp | 4 ++-- fdbrpc/FlowTransport.actor.cpp | 5 +++-- fdbserver/Coordination.actor.cpp | 21 +++++++++++---------- 5 files changed, 31 insertions(+), 27 deletions(-) diff --git a/fdbclient/CoordinationInterface.h b/fdbclient/CoordinationInterface.h index 2ebd4e1259..919ae9c315 100644 --- a/fdbclient/CoordinationInterface.h +++ b/fdbclient/CoordinationInterface.h @@ -36,11 +36,11 @@ constexpr UID 
WLTOKEN_CLIENTLEADERREG_DESCRIPTOR_MUTABLE(-1, 4); constexpr UID WLTOKEN_PROTOCOL_INFO(-1, 11); -// The coordinator interface as exposed to clients +// well known endpoints published to the client. struct ClientLeaderRegInterface { RequestStream getLeader; RequestStream openDatabase; - RequestStream checkClusterNameMutability; + RequestStream checkDescriptorMutable; ClientLeaderRegInterface() {} ClientLeaderRegInterface(NetworkAddress remote); @@ -238,10 +238,11 @@ struct ProtocolInfoRequest { } }; -struct CheckClusterNameMutabilityReply { +// Returns true if the cluster descriptor may be modified. +struct CheckDescriptorMutableReply { constexpr static FileIdentifier file_identifier = 7784299; - CheckClusterNameMutabilityReply() = default; - explicit CheckClusterNameMutabilityReply(bool value) : value(value) {} + CheckDescriptorMutableReply() = default; + explicit CheckDescriptorMutableReply(bool value) : value(value) {} bool value; template void serialize(Ar& ar) { @@ -249,12 +250,13 @@ struct CheckClusterNameMutabilityReply { } }; -struct CheckClusterNameMutability { +// Allows client to check if allowed to change the cluster descriptor. +struct CheckDescriptorMutable { constexpr static FileIdentifier file_identifier = 214729; Key key; - ReplyPromise reply; - explicit CheckClusterNameMutability(Key key) : key(key) {} - CheckClusterNameMutability(){} + ReplyPromise reply; + explicit CheckDescriptorMutable(Key key) : key(key) {} + CheckDescriptorMutable(){} template void serialize(Ar& ar) { diff --git a/fdbclient/ManagementAPI.actor.cpp b/fdbclient/ManagementAPI.actor.cpp index 56d9f0e6ec..2d6fb4c36b 100644 --- a/fdbclient/ManagementAPI.actor.cpp +++ b/fdbclient/ManagementAPI.actor.cpp @@ -1191,12 +1191,12 @@ ACTOR Future changeQuorum(Database cx, Reference>> leaderServers; state ClientCoordinators coord(Reference(new ClusterConnectionFile(conn))); + // check if allowed to modify the cluster descriptor if (! 
change->getDesiredClusterKeyName().empty()) { - CheckClusterNameMutabilityReply mutabilityReply = wait(coord.clientLeaderServers[0].checkClusterNameMutability.getReply( - CheckClusterNameMutability())); - if (! mutabilityReply.value) { + CheckDescriptorMutableReply mutabilityReply = wait(coord.clientLeaderServers[0].checkDescriptorMutable.getReply( + CheckDescriptorMutable())); + if (! mutabilityReply.value) return CoordinatorsResult::BAD_DATABASE_STATE; - } } leaderServers.reserve(coord.clientLeaderServers.size()); for (int i = 0; i < coord.clientLeaderServers.size(); i++) diff --git a/fdbclient/MonitorLeader.actor.cpp b/fdbclient/MonitorLeader.actor.cpp index a4dfe5a4a1..0a22c0c508 100644 --- a/fdbclient/MonitorLeader.actor.cpp +++ b/fdbclient/MonitorLeader.actor.cpp @@ -381,12 +381,12 @@ ClientCoordinators::ClientCoordinators(Key clusterKey, std::vector pingLatencyLogger(TransportData* self) { } TransportData::TransportData(uint64_t transportId) - : endpoints(/*wellKnownTokenCount*/ WLTOKEN_COUNTS), endpointNotFoundReceiver(endpoints), pingReceiver(endpoints), + : endpoints(WLTOKEN_COUNTS), endpointNotFoundReceiver(endpoints), pingReceiver(endpoints), warnAlwaysForLargePacket(true), lastIncompatibleMessage(0), transportId(transportId), numIncompatibleConnections(0) { degraded = makeReference>(false); diff --git a/fdbserver/Coordination.actor.cpp b/fdbserver/Coordination.actor.cpp index 974062a056..eb7dcf8c6c 100644 --- a/fdbserver/Coordination.actor.cpp +++ b/fdbserver/Coordination.actor.cpp @@ -545,7 +545,8 @@ struct LeaderRegisterCollection { } }; -StringRef getClusterName(Key key) { +// extract the prefix descriptor from cluster id +StringRef getClusterDescriptor(Key key) { StringRef str = key.contents(); return str.eat(":"); } @@ -558,10 +559,10 @@ ACTOR Future leaderServer(LeaderElectionRegInterface interf, OnDemandStore state ActorCollection forwarders(false); wait(LeaderRegisterCollection::init(®s)); - + loop choose { - when(CheckClusterNameMutability req 
= waitNext(interf.checkClusterNameMutability.getFuture())) { - CheckClusterNameMutabilityReply rep(SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT ? true : false); + when(CheckDescriptorMutable req = waitNext(interf.checkDescriptorMutable.getFuture())) { + CheckDescriptorMutableReply rep(SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT ? true : false); req.reply.send(rep); } when(OpenDatabaseCoordRequest req = waitNext(interf.openDatabase.getFuture())) { @@ -574,7 +575,7 @@ ACTOR Future leaderServer(LeaderElectionRegInterface interf, OnDemandStore } else { StringRef clusterName = ccf->getConnectionString().clusterKeyName(); if (! SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT && - getClusterName(req.clusterKey).compare(clusterName)) { + getClusterDescriptor(req.clusterKey).compare(clusterName)) { TraceEvent(SevError, "CCFMismatch") .detail("RequestType", "OpenDatabaseCoordRequest") .detail("LocalCS", ccf->getConnectionString().toString()) @@ -593,7 +594,7 @@ ACTOR Future leaderServer(LeaderElectionRegInterface interf, OnDemandStore } else { StringRef clusterName = ccf->getConnectionString().clusterKeyName(); if (! SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT && - getClusterName(req.key).compare(clusterName)) { + getClusterDescriptor(req.key).compare(clusterName)) { TraceEvent(SevError, "CCFMismatch") .detail("RequestType", "ElectionResultRequest") .detail("LocalCS", ccf->getConnectionString().toString()) @@ -613,7 +614,7 @@ ACTOR Future leaderServer(LeaderElectionRegInterface interf, OnDemandStore else { StringRef clusterName = ccf->getConnectionString().clusterKeyName(); if (! 
SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT && - getClusterName(req.key).compare(clusterName)) { + getClusterDescriptor(req.key).compare(clusterName)) { TraceEvent(SevError, "CCFMismatch") .detail("RequestType", "GetLeaderRequest") .detail("LocalCS", ccf->getConnectionString().toString()) @@ -632,7 +633,7 @@ ACTOR Future leaderServer(LeaderElectionRegInterface interf, OnDemandStore else { StringRef clusterName = ccf->getConnectionString().clusterKeyName(); if (! SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT && - getClusterName(req.key).compare(clusterName)) { + getClusterDescriptor(req.key).compare(clusterName)) { TraceEvent(SevError, "CCFMismatch") .detail("RequestType", "CandidacyRequest") .detail("LocalCS", ccf->getConnectionString().toString()) @@ -650,7 +651,7 @@ ACTOR Future leaderServer(LeaderElectionRegInterface interf, OnDemandStore else { StringRef clusterName = ccf->getConnectionString().clusterKeyName(); if (! SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT && - getClusterName(req.key).compare(clusterName)) { + getClusterDescriptor(req.key).compare(clusterName)) { TraceEvent(SevError, "CCFMismatch") .detail("RequestType", "LeaderHeartbeatRequest") .detail("LocalCS", ccf->getConnectionString().toString()) @@ -668,7 +669,7 @@ ACTOR Future leaderServer(LeaderElectionRegInterface interf, OnDemandStore else { StringRef clusterName = ccf->getConnectionString().clusterKeyName(); if (! 
SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT && - getClusterName(req.key).compare(clusterName)) { + getClusterDescriptor(req.key).compare(clusterName)) { TraceEvent(SevError, "CCFMismatch") .detail("RequestType", "ForwardRequest") .detail("LocalCS", ccf->getConnectionString().toString()) From fc65154b5dad31705bf628abb52d4c4361ef4720 Mon Sep 17 00:00:00 2001 From: Dan Lambright Date: Wed, 28 Apr 2021 08:48:15 -0400 Subject: [PATCH 451/461] forward back new coordinator --- fdbclient/CoordinationInterface.h | 18 +++++++++--------- fdbserver/Coordination.actor.cpp | 11 ++++++++--- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/fdbclient/CoordinationInterface.h b/fdbclient/CoordinationInterface.h index 919ae9c315..71448bf8b2 100644 --- a/fdbclient/CoordinationInterface.h +++ b/fdbclient/CoordinationInterface.h @@ -252,16 +252,16 @@ struct CheckDescriptorMutableReply { // Allows client to check if allowed to change the cluster descriptor. struct CheckDescriptorMutable { - constexpr static FileIdentifier file_identifier = 214729; - Key key; - ReplyPromise reply; - explicit CheckDescriptorMutable(Key key) : key(key) {} - CheckDescriptorMutable(){} + constexpr static FileIdentifier file_identifier = 214729; + Key key; + ReplyPromise reply; + explicit CheckDescriptorMutable(Key key) : key(key) {} + CheckDescriptorMutable(){} - template - void serialize(Ar& ar) { - serializer(ar, key, reply); - } + template + void serialize(Ar& ar) { + serializer(ar, key, reply); + } }; #endif diff --git a/fdbserver/Coordination.actor.cpp b/fdbserver/Coordination.actor.cpp index eb7dcf8c6c..733637e7a0 100644 --- a/fdbserver/Coordination.actor.cpp +++ b/fdbserver/Coordination.actor.cpp @@ -559,11 +559,16 @@ ACTOR Future leaderServer(LeaderElectionRegInterface interf, OnDemandStore state ActorCollection forwarders(false); wait(LeaderRegisterCollection::init(®s)); - + loop choose { when(CheckDescriptorMutable req = waitNext(interf.checkDescriptorMutable.getFuture())) { - 
CheckDescriptorMutableReply rep(SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT ? true : false); - req.reply.send(rep); + Optional forward = regs.getForward(req.key); + if (forward.present()) { + req.reply.send(CheckDescriptorMutableReply{false}); + } else { + CheckDescriptorMutableReply rep(SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT ? true : false); + req.reply.send(rep); + } } when(OpenDatabaseCoordRequest req = waitNext(interf.openDatabase.getFuture())) { Optional forward = regs.getForward(req.clusterKey); From 53d0ecc2fa0e2c04767f02d2116f588753ce7cd9 Mon Sep 17 00:00:00 2001 From: Dan Lambright Date: Wed, 5 May 2021 15:01:56 -0400 Subject: [PATCH 452/461] respond to comments made on 5/4 --- fdbclient/CoordinationInterface.h | 3 +- fdbserver/Coordination.actor.cpp | 47 +++++++++++++++---------------- 2 files changed, 24 insertions(+), 26 deletions(-) diff --git a/fdbclient/CoordinationInterface.h b/fdbclient/CoordinationInterface.h index 71448bf8b2..a36182b7f3 100644 --- a/fdbclient/CoordinationInterface.h +++ b/fdbclient/CoordinationInterface.h @@ -34,6 +34,7 @@ constexpr UID WLTOKEN_CLIENTLEADERREG_GETLEADER(-1, 2); constexpr UID WLTOKEN_CLIENTLEADERREG_OPENDATABASE(-1, 3); constexpr UID WLTOKEN_CLIENTLEADERREG_DESCRIPTOR_MUTABLE(-1, 4); +// the value of this endpoint should be stable and not change. constexpr UID WLTOKEN_PROTOCOL_INFO(-1, 11); // well known endpoints published to the client. 
@@ -256,7 +257,7 @@ struct CheckDescriptorMutable { Key key; ReplyPromise reply; explicit CheckDescriptorMutable(Key key) : key(key) {} - CheckDescriptorMutable(){} + CheckDescriptorMutable() {} template void serialize(Ar& ar) { diff --git a/fdbserver/Coordination.actor.cpp b/fdbserver/Coordination.actor.cpp index 733637e7a0..22c1fb2ce8 100644 --- a/fdbserver/Coordination.actor.cpp +++ b/fdbserver/Coordination.actor.cpp @@ -406,8 +406,8 @@ ACTOR Future leaderRegister(LeaderElectionRegInterface interf, Key key) { // If the current leader's priority became worse, we still need to notified all clients because now one // of them might be better than the leader. In addition, even though FitnessRemote is better than - // FitnessUnknown, we still need to notified clients so that monitorLeaderRemotely has a chance to switch - // from passively monitoring the leader to actively attempting to become the leader. + // FitnessUnknown, we still need to notified clients so that monitorLeaderRemotely has a chance to + // switch from passively monitoring the leader to actively attempting to become the leader. if (!currentNominee.present() || !nextNominee.present() || !currentNominee.get().equalInternalId(nextNominee.get()) || nextNominee.get() > currentNominee.get() || @@ -553,8 +553,10 @@ StringRef getClusterDescriptor(Key key) { // leaderServer multiplexes multiple leaderRegisters onto a single LeaderElectionRegInterface, // creating and destroying them on demand. 
-ACTOR Future leaderServer(LeaderElectionRegInterface interf, OnDemandStore* pStore, UID id, - Reference ccf) { +ACTOR Future leaderServer(LeaderElectionRegInterface interf, + OnDemandStore* pStore, + UID id, + Reference ccf) { state LeaderRegisterCollection regs(pStore); state ActorCollection forwarders(false); @@ -564,7 +566,7 @@ ACTOR Future leaderServer(LeaderElectionRegInterface interf, OnDemandStore when(CheckDescriptorMutable req = waitNext(interf.checkDescriptorMutable.getFuture())) { Optional forward = regs.getForward(req.key); if (forward.present()) { - req.reply.send(CheckDescriptorMutableReply{false}); + req.reply.send(CheckDescriptorMutableReply{ false }); } else { CheckDescriptorMutableReply rep(SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT ? true : false); req.reply.send(rep); @@ -579,9 +581,9 @@ ACTOR Future leaderServer(LeaderElectionRegInterface interf, OnDemandStore req.reply.send(CachedSerialization(info)); } else { StringRef clusterName = ccf->getConnectionString().clusterKeyName(); - if (! SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT && - getClusterDescriptor(req.clusterKey).compare(clusterName)) { - TraceEvent(SevError, "CCFMismatch") + if (!SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT && + getClusterDescriptor(req.clusterKey).compare(clusterName)) { + TraceEvent(SevWarn, "CCFMismatch") .detail("RequestType", "OpenDatabaseCoordRequest") .detail("LocalCS", ccf->getConnectionString().toString()) .detail("IncomingClusterKey", req.clusterKey) @@ -598,9 +600,8 @@ ACTOR Future leaderServer(LeaderElectionRegInterface interf, OnDemandStore req.reply.send(forward.get()); } else { StringRef clusterName = ccf->getConnectionString().clusterKeyName(); - if (! 
SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT && - getClusterDescriptor(req.key).compare(clusterName)) { - TraceEvent(SevError, "CCFMismatch") + if (!SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT && getClusterDescriptor(req.key).compare(clusterName)) { + TraceEvent(SevWarn, "CCFMismatch") .detail("RequestType", "ElectionResultRequest") .detail("LocalCS", ccf->getConnectionString().toString()) .detail("IncomingClusterKey", req.key) @@ -618,9 +619,8 @@ ACTOR Future leaderServer(LeaderElectionRegInterface interf, OnDemandStore req.reply.send(forward.get()); else { StringRef clusterName = ccf->getConnectionString().clusterKeyName(); - if (! SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT && - getClusterDescriptor(req.key).compare(clusterName)) { - TraceEvent(SevError, "CCFMismatch") + if (!SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT && getClusterDescriptor(req.key).compare(clusterName)) { + TraceEvent(SevWarn, "CCFMismatch") .detail("RequestType", "GetLeaderRequest") .detail("LocalCS", ccf->getConnectionString().toString()) .detail("IncomingClusterKey", req.key) @@ -637,9 +637,8 @@ ACTOR Future leaderServer(LeaderElectionRegInterface interf, OnDemandStore req.reply.send(forward.get()); else { StringRef clusterName = ccf->getConnectionString().clusterKeyName(); - if (! SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT && - getClusterDescriptor(req.key).compare(clusterName)) { - TraceEvent(SevError, "CCFMismatch") + if (!SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT && getClusterDescriptor(req.key).compare(clusterName)) { + TraceEvent(SevWarn, "CCFMismatch") .detail("RequestType", "CandidacyRequest") .detail("LocalCS", ccf->getConnectionString().toString()) .detail("IncomingClusterKey", req.key); @@ -655,9 +654,8 @@ ACTOR Future leaderServer(LeaderElectionRegInterface interf, OnDemandStore req.reply.send(LeaderHeartbeatReply{ false }); else { StringRef clusterName = ccf->getConnectionString().clusterKeyName(); - if (! 
SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT && - getClusterDescriptor(req.key).compare(clusterName)) { - TraceEvent(SevError, "CCFMismatch") + if (!SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT && getClusterDescriptor(req.key).compare(clusterName)) { + TraceEvent(SevWarn, "CCFMismatch") .detail("RequestType", "LeaderHeartbeatRequest") .detail("LocalCS", ccf->getConnectionString().toString()) .detail("IncomingClusterKey", req.key); @@ -673,16 +671,15 @@ ACTOR Future leaderServer(LeaderElectionRegInterface interf, OnDemandStore req.reply.send(Void()); else { StringRef clusterName = ccf->getConnectionString().clusterKeyName(); - if (! SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT && - getClusterDescriptor(req.key).compare(clusterName)) { - TraceEvent(SevError, "CCFMismatch") + if (!SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT && getClusterDescriptor(req.key).compare(clusterName)) { + TraceEvent(SevWarn, "CCFMismatch") .detail("RequestType", "ForwardRequest") .detail("LocalCS", ccf->getConnectionString().toString()) .detail("IncomingClusterKey", req.key); req.reply.sendError(wrong_connection_file()); } else { - forwarders.add( - LeaderRegisterCollection::setForward(®s, req.key, ClusterConnectionString(req.conn.toString()))); + forwarders.add(LeaderRegisterCollection::setForward( + ®s, req.key, ClusterConnectionString(req.conn.toString()))); regs.getInterface(req.key, id).forward.send(req); } } From 64c10d36250c0acedb78ebd531598ef0e54237e6 Mon Sep 17 00:00:00 2001 From: Dan Lambright Date: Thu, 6 May 2021 11:51:33 -0400 Subject: [PATCH 453/461] fix joshua failures, formatting --- fdbclient/ManagementAPI.actor.cpp | 8 ++++---- fdbclient/MonitorLeader.actor.cpp | 6 ++++-- fdbrpc/FlowTransport.actor.cpp | 4 ++-- fdbserver/Knobs.h | 3 ++- fdbserver/workloads/ConfigureDatabase.actor.cpp | 5 +++-- fdbserver/workloads/SpecialKeySpaceCorrectness.actor.cpp | 4 +++- 6 files changed, 18 insertions(+), 12 deletions(-) diff --git a/fdbclient/ManagementAPI.actor.cpp 
b/fdbclient/ManagementAPI.actor.cpp index 2d6fb4c36b..bfa998d25f 100644 --- a/fdbclient/ManagementAPI.actor.cpp +++ b/fdbclient/ManagementAPI.actor.cpp @@ -1192,10 +1192,10 @@ ACTOR Future changeQuorum(Database cx, Reference>> leaderServers; state ClientCoordinators coord(Reference(new ClusterConnectionFile(conn))); // check if allowed to modify the cluster descriptor - if (! change->getDesiredClusterKeyName().empty()) { - CheckDescriptorMutableReply mutabilityReply = wait(coord.clientLeaderServers[0].checkDescriptorMutable.getReply( - CheckDescriptorMutable())); - if (! mutabilityReply.value) + if (!change->getDesiredClusterKeyName().empty()) { + CheckDescriptorMutableReply mutabilityReply = + wait(coord.clientLeaderServers[0].checkDescriptorMutable.getReply(CheckDescriptorMutable())); + if (!mutabilityReply.value) return CoordinatorsResult::BAD_DATABASE_STATE; } leaderServers.reserve(coord.clientLeaderServers.size()); diff --git a/fdbclient/MonitorLeader.actor.cpp b/fdbclient/MonitorLeader.actor.cpp index 0a22c0c508..86a09ff424 100644 --- a/fdbclient/MonitorLeader.actor.cpp +++ b/fdbclient/MonitorLeader.actor.cpp @@ -386,7 +386,8 @@ ClientLeaderRegInterface::ClientLeaderRegInterface(NetworkAddress remote) ClientLeaderRegInterface::ClientLeaderRegInterface(INetwork* local) { getLeader.makeWellKnownEndpoint(WLTOKEN_CLIENTLEADERREG_GETLEADER, TaskPriority::Coordination); openDatabase.makeWellKnownEndpoint(WLTOKEN_CLIENTLEADERREG_OPENDATABASE, TaskPriority::Coordination); - checkDescriptorMutable.makeWellKnownEndpoint(WLTOKEN_CLIENTLEADERREG_DESCRIPTOR_MUTABLE, TaskPriority::Coordination); + checkDescriptorMutable.makeWellKnownEndpoint(WLTOKEN_CLIENTLEADERREG_DESCRIPTOR_MUTABLE, + TaskPriority::Coordination); } // Nominee is the worker among all workers that are considered as leader by a coordinator @@ -498,7 +499,8 @@ ACTOR Future monitorLeaderOneGeneration(ReferencegetConnectionString().toString()).trackLatest("MonitorLeaderForwarding"); + .detail("OldConnStr", 
info.intermediateConnFile->getConnectionString().toString()) + .trackLatest("MonitorLeaderForwarding"); info.intermediateConnFile = makeReference( connFile->getFilename(), ClusterConnectionString(leader.get().first.serializedInfo.toString())); return info; diff --git a/fdbrpc/FlowTransport.actor.cpp b/fdbrpc/FlowTransport.actor.cpp index 23cd61be53..248011ffcb 100644 --- a/fdbrpc/FlowTransport.actor.cpp +++ b/fdbrpc/FlowTransport.actor.cpp @@ -51,7 +51,7 @@ constexpr UID WLTOKEN_PING_PACKET(-1, 1); constexpr int PACKET_LEN_WIDTH = sizeof(uint32_t); const uint64_t TOKEN_STREAM_FLAG = 1; -const int WLTOKEN_COUNTS = 12; // number of wellKnownEndpoints +const int WLTOKEN_COUNTS = 13; // number of wellKnownEndpoints class EndpointMap : NonCopyable { public: @@ -1218,7 +1218,7 @@ ACTOR static Future connectionReader(TransportData* transport, } compatible = false; if (!protocolVersion.hasInexpensiveMultiVersionClient()) { - if(peer) { + if (peer) { peer->protocolVersion->set(protocolVersion); } diff --git a/fdbserver/Knobs.h b/fdbserver/Knobs.h index 3426f6bb18..a89ac9c375 100644 --- a/fdbserver/Knobs.h +++ b/fdbserver/Knobs.h @@ -559,7 +559,8 @@ public: // Coordination double COORDINATED_STATE_ONCONFLICT_POLL_INTERVAL; - bool ENABLE_CROSS_CLUSTER_SUPPORT; // Allow a coordinator to serve requests whose connection string does not match the local descriptor + bool ENABLE_CROSS_CLUSTER_SUPPORT; // Allow a coordinator to serve requests whose connection string does not match + // the local descriptor // Buggification double BUGGIFIED_EVENTUAL_CONSISTENCY; diff --git a/fdbserver/workloads/ConfigureDatabase.actor.cpp b/fdbserver/workloads/ConfigureDatabase.actor.cpp index 0ab7d1b88b..ae03375ccb 100644 --- a/fdbserver/workloads/ConfigureDatabase.actor.cpp +++ b/fdbserver/workloads/ConfigureDatabase.actor.cpp @@ -318,12 +318,13 @@ struct ConfigureDatabaseWorkload : TestWorkload { //TraceEvent("ConfigureTestQuorumBegin").detail("NewQuorum", s); auto ch = autoQuorumChange(); 
std::string desiredClusterName = "NewName%d"; - if (! SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT) { + if (!SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT) { // if configuration does not allow changing the descriptor, pass empty string (keep old descriptor) desiredClusterName = ""; } if (deterministicRandom()->randomInt(0, 2)) - ch = nameQuorumChange(format(desiredClusterName.c_str(), deterministicRandom()->randomInt(0, 100)), ch); + ch = nameQuorumChange(format(desiredClusterName.c_str(), deterministicRandom()->randomInt(0, 100)), + ch); wait(success(changeQuorum(cx, ch))); //TraceEvent("ConfigureTestConfigureEnd").detail("NewQuorum", s); } else if (randomChoice == 5) { diff --git a/fdbserver/workloads/SpecialKeySpaceCorrectness.actor.cpp b/fdbserver/workloads/SpecialKeySpaceCorrectness.actor.cpp index 5a38e20d7e..6d6f711a9f 100644 --- a/fdbserver/workloads/SpecialKeySpaceCorrectness.actor.cpp +++ b/fdbserver/workloads/SpecialKeySpaceCorrectness.actor.cpp @@ -937,7 +937,9 @@ struct SpecialKeySpaceCorrectnessWorkload : TestWorkload { // we randomly pick one process(not coordinator) and add it, in this case, it should always succeed { // choose a new description if configuration allows transactions across differently named clusters - state std::string new_cluster_description = SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT ? deterministicRandom()->randomAlphaNumeric(8) : cs.clusterKeyName().toString(); + state std::string new_cluster_description = SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT + ? 
deterministicRandom()->randomAlphaNumeric(8) + : cs.clusterKeyName().toString(); state std::string new_coordinator_process; state std::vector old_coordinators_processes; state bool possible_to_add_coordinator; From 10289ef8f1d5aeba5842e506b958f7b7f8bfe799 Mon Sep 17 00:00:00 2001 From: Dan Lambright Date: Mon, 17 May 2021 17:22:19 -0400 Subject: [PATCH 454/461] Respond to AJs comments --- fdbclient/CoordinationInterface.h | 17 ++++++++--------- fdbclient/ManagementAPI.actor.cpp | 4 ++-- fdbrpc/FlowTransport.actor.cpp | 4 ++-- fdbserver/Coordination.actor.cpp | 8 +++++--- fdbserver/CoordinationInterface.h | 12 ++++++------ 5 files changed, 23 insertions(+), 22 deletions(-) diff --git a/fdbclient/CoordinationInterface.h b/fdbclient/CoordinationInterface.h index a36182b7f3..dda9cb47ed 100644 --- a/fdbclient/CoordinationInterface.h +++ b/fdbclient/CoordinationInterface.h @@ -32,16 +32,16 @@ const int MAX_CLUSTER_FILE_BYTES = 60000; constexpr UID WLTOKEN_CLIENTLEADERREG_GETLEADER(-1, 2); constexpr UID WLTOKEN_CLIENTLEADERREG_OPENDATABASE(-1, 3); -constexpr UID WLTOKEN_CLIENTLEADERREG_DESCRIPTOR_MUTABLE(-1, 4); // the value of this endpoint should be stable and not change. -constexpr UID WLTOKEN_PROTOCOL_INFO(-1, 11); +constexpr UID WLTOKEN_PROTOCOL_INFO(-1, 10); +constexpr UID WLTOKEN_CLIENTLEADERREG_DESCRIPTOR_MUTABLE(-1, 11); // well known endpoints published to the client. 
struct ClientLeaderRegInterface { RequestStream getLeader; RequestStream openDatabase; - RequestStream checkDescriptorMutable; + RequestStream checkDescriptorMutable; ClientLeaderRegInterface() {} ClientLeaderRegInterface(NetworkAddress remote); @@ -243,21 +243,20 @@ struct ProtocolInfoRequest { struct CheckDescriptorMutableReply { constexpr static FileIdentifier file_identifier = 7784299; CheckDescriptorMutableReply() = default; - explicit CheckDescriptorMutableReply(bool value) : value(value) {} - bool value; + explicit CheckDescriptorMutableReply(bool isMutable) : isMutable(isMutable) {} + bool isMutable; template void serialize(Ar& ar) { - serializer(ar, value); + serializer(ar, isMutable); } }; // Allows client to check if allowed to change the cluster descriptor. -struct CheckDescriptorMutable { +struct CheckDescriptorMutableRequest { constexpr static FileIdentifier file_identifier = 214729; Key key; ReplyPromise reply; - explicit CheckDescriptorMutable(Key key) : key(key) {} - CheckDescriptorMutable() {} + CheckDescriptorMutableRequest() {} template void serialize(Ar& ar) { diff --git a/fdbclient/ManagementAPI.actor.cpp b/fdbclient/ManagementAPI.actor.cpp index bfa998d25f..217340a93c 100644 --- a/fdbclient/ManagementAPI.actor.cpp +++ b/fdbclient/ManagementAPI.actor.cpp @@ -1194,8 +1194,8 @@ ACTOR Future changeQuorum(Database cx, ReferencegetDesiredClusterKeyName().empty()) { CheckDescriptorMutableReply mutabilityReply = - wait(coord.clientLeaderServers[0].checkDescriptorMutable.getReply(CheckDescriptorMutable())); - if (!mutabilityReply.value) + wait(coord.clientLeaderServers[0].checkDescriptorMutable.getReply(CheckDescriptorMutableRequest())); + if (!mutabilityReply.isMutable) return CoordinatorsResult::BAD_DATABASE_STATE; } leaderServers.reserve(coord.clientLeaderServers.size()); diff --git a/fdbrpc/FlowTransport.actor.cpp b/fdbrpc/FlowTransport.actor.cpp index 248011ffcb..9e978dda66 100644 --- a/fdbrpc/FlowTransport.actor.cpp +++ 
b/fdbrpc/FlowTransport.actor.cpp @@ -51,7 +51,7 @@ constexpr UID WLTOKEN_PING_PACKET(-1, 1); constexpr int PACKET_LEN_WIDTH = sizeof(uint32_t); const uint64_t TOKEN_STREAM_FLAG = 1; -const int WLTOKEN_COUNTS = 13; // number of wellKnownEndpoints +const int WLTOKEN_COUNTS = 12; // number of wellKnownEndpoints class EndpointMap : NonCopyable { public: @@ -98,7 +98,7 @@ void EndpointMap::realloc() { void EndpointMap::insertWellKnown(NetworkMessageReceiver* r, const Endpoint::Token& token, TaskPriority priority) { int index = token.second(); - ASSERT(index < WLTOKEN_COUNTS); + ASSERT(index <= WLTOKEN_COUNTS); ASSERT(data[index].receiver == nullptr); data[index].receiver = r; data[index].token() = diff --git a/fdbserver/Coordination.actor.cpp b/fdbserver/Coordination.actor.cpp index 22c1fb2ce8..16124db34a 100644 --- a/fdbserver/Coordination.actor.cpp +++ b/fdbserver/Coordination.actor.cpp @@ -563,12 +563,14 @@ ACTOR Future leaderServer(LeaderElectionRegInterface interf, wait(LeaderRegisterCollection::init(®s)); loop choose { - when(CheckDescriptorMutable req = waitNext(interf.checkDescriptorMutable.getFuture())) { + when(CheckDescriptorMutableRequest req = waitNext(interf.checkDescriptorMutable.getFuture())) { Optional forward = regs.getForward(req.key); + // Note the response returns the value of a knob enforced by checking only one coordinator. It is not + // quorum based. if (forward.present()) { - req.reply.send(CheckDescriptorMutableReply{ false }); + req.reply.sendError(coordinators_changed()); } else { - CheckDescriptorMutableReply rep(SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT ? 
true : false); + CheckDescriptorMutableReply rep(SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT); req.reply.send(rep); } } diff --git a/fdbserver/CoordinationInterface.h b/fdbserver/CoordinationInterface.h index 9cf4cb3ea0..ea379d1358 100644 --- a/fdbserver/CoordinationInterface.h +++ b/fdbserver/CoordinationInterface.h @@ -24,12 +24,12 @@ #include "fdbclient/CoordinationInterface.h" -constexpr UID WLTOKEN_LEADERELECTIONREG_CANDIDACY(-1, 5); -constexpr UID WLTOKEN_LEADERELECTIONREG_ELECTIONRESULT(-1, 6); -constexpr UID WLTOKEN_LEADERELECTIONREG_LEADERHEARTBEAT(-1, 7); -constexpr UID WLTOKEN_LEADERELECTIONREG_FORWARD(-1, 8); -constexpr UID WLTOKEN_GENERATIONREG_READ(-1, 9); -constexpr UID WLTOKEN_GENERATIONREG_WRITE(-1, 10); +constexpr UID WLTOKEN_LEADERELECTIONREG_CANDIDACY(-1, 4); +constexpr UID WLTOKEN_LEADERELECTIONREG_ELECTIONRESULT(-1, 5); +constexpr UID WLTOKEN_LEADERELECTIONREG_LEADERHEARTBEAT(-1, 6); +constexpr UID WLTOKEN_LEADERELECTIONREG_FORWARD(-1, 7); +constexpr UID WLTOKEN_GENERATIONREG_READ(-1, 8); +constexpr UID WLTOKEN_GENERATIONREG_WRITE(-1, 9); struct GenerationRegInterface { constexpr static FileIdentifier file_identifier = 16726744; From d233e1736f1fdb3282c7b70baa4fede5ed4e01e5 Mon Sep 17 00:00:00 2001 From: Jingyu Zhou Date: Thu, 27 May 2021 09:58:02 -0700 Subject: [PATCH 455/461] Add release notes for PR 4863 --- documentation/sphinx/source/release-notes/release-notes-630.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/documentation/sphinx/source/release-notes/release-notes-630.rst b/documentation/sphinx/source/release-notes/release-notes-630.rst index bebd55e859..ca6a8fd029 100644 --- a/documentation/sphinx/source/release-notes/release-notes-630.rst +++ b/documentation/sphinx/source/release-notes/release-notes-630.rst @@ -5,6 +5,7 @@ Release Notes 6.3.14 ====== +* Fixed fdbbackup start command that automatically configures database with backup workers to only do so when using partitioned logs. 
`(PR #4863) `_ * Added ``cluster.bounce_impact`` section to status to report if there will be any extra effects when bouncing the cluster, and if so, the reason for those effects. `(PR #4770) `_ * Added ``fetched_versions`` to the storage metrics section of status to report how fast a storage server is catching up in versions. `(PR #4770) `_ * Added ``fetches_from_logs`` to the storage metrics section of status to report how frequently a storage server fetches updates from transaction logs. `(PR #4770) `_ From d82eac406245dbcc16736dd7f81bee57d7db0fea Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Thu, 27 May 2021 20:41:49 -0700 Subject: [PATCH 456/461] Fix a test issue where closing an AsyncFileNonDurable could permanently prevent you from reopening the file if the machine was in a failed state during cleanup --- fdbrpc/AsyncFileNonDurable.actor.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fdbrpc/AsyncFileNonDurable.actor.h b/fdbrpc/AsyncFileNonDurable.actor.h index bde8e0fe9e..f813c1a354 100644 --- a/fdbrpc/AsyncFileNonDurable.actor.h +++ b/fdbrpc/AsyncFileNonDurable.actor.h @@ -238,6 +238,7 @@ public: //TraceEvent("AsyncFileNonDurableOpenWaitOnDelete2").detail("Filename", filename); if (shutdown.isReady()) throw io_error().asInjectedFault(); + wait(g_simulator.onProcess(currentProcess, currentTaskID)); } state Reference nonDurableFile( @@ -859,11 +860,9 @@ private: //TraceEvent("AsyncFileNonDurable_FinishDelete", self->id).detail("Filename", self->filename); delete self; - wait(g_simulator.onProcess(currentProcess, currentTaskID)); return Void(); } catch (Error& e) { state Error err = e; - wait(g_simulator.onProcess(currentProcess, currentTaskID)); throw err; } } From 750901dd1d9c124801701156be7bf677b0adfc6f Mon Sep 17 00:00:00 2001 From: "A.J. 
Beamon" Date: Thu, 27 May 2021 21:54:59 -0700 Subject: [PATCH 457/461] Reduce the frequency that buggified reads are failed so that transactions with a lot of reads aren't doomed to almost always fail. --- fdbclient/NativeAPI.actor.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 75c11db594..214b8196ac 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -2235,7 +2235,7 @@ ACTOR Future> getValue(Future version, state GetValueReply reply; try { - if (CLIENT_BUGGIFY) { + if (CLIENT_BUGGIFY_WITH_PROB(.01)) { throw deterministicRandom()->randomChoice( std::vector{ transaction_too_old(), future_version() }); } @@ -3078,7 +3078,7 @@ ACTOR Future getRange(Database cx, ++cx->transactionPhysicalReads; state GetKeyValuesReply rep; try { - if (CLIENT_BUGGIFY) { + if (CLIENT_BUGGIFY_WITH_PROB(.01)) { throw deterministicRandom()->randomChoice( std::vector{ transaction_too_old(), future_version() }); } From cc3175fc505bf130e98cb2cb69a7200e966c2ebf Mon Sep 17 00:00:00 2001 From: Dan Lambright Date: Fri, 28 May 2021 11:09:41 -0400 Subject: [PATCH 458/461] remove forwarding --- fdbclient/CoordinationInterface.h | 3 +-- fdbserver/Coordination.actor.cpp | 9 ++------- 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/fdbclient/CoordinationInterface.h b/fdbclient/CoordinationInterface.h index dda9cb47ed..bb76688b15 100644 --- a/fdbclient/CoordinationInterface.h +++ b/fdbclient/CoordinationInterface.h @@ -254,13 +254,12 @@ struct CheckDescriptorMutableReply { // Allows client to check if allowed to change the cluster descriptor. 
struct CheckDescriptorMutableRequest { constexpr static FileIdentifier file_identifier = 214729; - Key key; ReplyPromise reply; CheckDescriptorMutableRequest() {} template void serialize(Ar& ar) { - serializer(ar, key, reply); + serializer(ar, reply); } }; diff --git a/fdbserver/Coordination.actor.cpp b/fdbserver/Coordination.actor.cpp index 16124db34a..02c90aad19 100644 --- a/fdbserver/Coordination.actor.cpp +++ b/fdbserver/Coordination.actor.cpp @@ -564,15 +564,10 @@ ACTOR Future leaderServer(LeaderElectionRegInterface interf, loop choose { when(CheckDescriptorMutableRequest req = waitNext(interf.checkDescriptorMutable.getFuture())) { - Optional forward = regs.getForward(req.key); // Note the response returns the value of a knob enforced by checking only one coordinator. It is not // quorum based. - if (forward.present()) { - req.reply.sendError(coordinators_changed()); - } else { - CheckDescriptorMutableReply rep(SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT); - req.reply.send(rep); - } + CheckDescriptorMutableReply rep(SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT); + req.reply.send(rep); } when(OpenDatabaseCoordRequest req = waitNext(interf.openDatabase.getFuture())) { Optional forward = regs.getForward(req.clusterKey); From f6253db7dc08ce470e2d8c2f5b182a06e9411da8 Mon Sep 17 00:00:00 2001 From: Josh Slocum Date: Fri, 28 May 2021 18:19:42 +0000 Subject: [PATCH 459/461] Addressing final PR comments --- fdbrpc/QueueModel.cpp | 2 +- fdbrpc/QueueModel.h | 2 +- fdbserver/storageserver.actor.cpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fdbrpc/QueueModel.cpp b/fdbrpc/QueueModel.cpp index 6aaaf3df34..124c839647 100644 --- a/fdbrpc/QueueModel.cpp +++ b/fdbrpc/QueueModel.cpp @@ -60,7 +60,7 @@ double QueueModel::addRequest(uint64_t id) { return d.penalty; } -void QueueModel::updateTssEndpoint(uint64_t endpointId, TSSEndpointData tssData) { +void QueueModel::updateTssEndpoint(uint64_t endpointId, const TSSEndpointData& tssData) { auto& d = 
data[endpointId]; if (!d.tssData.present()) { tssCount++; diff --git a/fdbrpc/QueueModel.h b/fdbrpc/QueueModel.h index 1e8cd009a0..89db9afee8 100644 --- a/fdbrpc/QueueModel.h +++ b/fdbrpc/QueueModel.h @@ -110,7 +110,7 @@ public: int laggingRequestCount; int laggingTSSCompareCount; - void updateTssEndpoint(uint64_t endpointId, TSSEndpointData endpointData); + void updateTssEndpoint(uint64_t endpointId, const TSSEndpointData& endpointData); void removeOldTssData(UID currentGeneration); Optional getTssData(uint64_t endpointId); diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp index 507de28f32..1f55bf4070 100644 --- a/fdbserver/storageserver.actor.cpp +++ b/fdbserver/storageserver.actor.cpp @@ -4071,7 +4071,7 @@ ACTOR Future restoreDurableState(StorageServer* data, IKeyValueStore* stor wait(yield()); } - // TODO why is this seemingly random delay here? + // TODO: why is this seemingly random delay here? wait(delay(0.0001)); { From f28dae7c70c102fada52ad27bbcdeb7fcbe4e853 Mon Sep 17 00:00:00 2001 From: "A.J. 
Beamon" Date: Fri, 28 May 2021 12:43:30 -0700 Subject: [PATCH 460/461] Require a minimum of 6.2.33 for 6.2 snapshot restarting tests to avoid a bug in prior versions --- tests/CMakeLists.txt | 16 ++++++++-------- .../SnapCycleRestart-1.txt | 0 .../SnapCycleRestart-2.txt | 0 .../SnapTestAttrition-1.txt | 0 .../SnapTestAttrition-2.txt | 0 .../SnapTestRestart-1.txt | 0 .../SnapTestRestart-2.txt | 0 .../SnapTestSimpleRestart-1.txt | 0 .../SnapTestSimpleRestart-2.txt | 0 9 files changed, 8 insertions(+), 8 deletions(-) rename tests/restarting/{from_6.2.29 => from_6.2.33}/SnapCycleRestart-1.txt (100%) rename tests/restarting/{from_6.2.29 => from_6.2.33}/SnapCycleRestart-2.txt (100%) rename tests/restarting/{from_6.2.29 => from_6.2.33}/SnapTestAttrition-1.txt (100%) rename tests/restarting/{from_6.2.29 => from_6.2.33}/SnapTestAttrition-2.txt (100%) rename tests/restarting/{from_6.2.29 => from_6.2.33}/SnapTestRestart-1.txt (100%) rename tests/restarting/{from_6.2.29 => from_6.2.33}/SnapTestRestart-2.txt (100%) rename tests/restarting/{from_6.2.29 => from_6.2.33}/SnapTestSimpleRestart-1.txt (100%) rename tests/restarting/{from_6.2.29 => from_6.2.33}/SnapTestSimpleRestart-2.txt (100%) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index e12b1e3ce9..5b254573fc 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -186,17 +186,17 @@ if(WITH_PYTHON) TEST_FILES restarting/from_5.0.0/StorefrontTestRestart-1.txt restarting/from_5.0.0/StorefrontTestRestart-2.txt) add_fdb_test( - TEST_FILES restarting/from_6.2.29/SnapTestAttrition-1.txt - restarting/from_6.2.29/SnapTestAttrition-2.txt) + TEST_FILES restarting/from_6.2.33/SnapTestAttrition-1.txt + restarting/from_6.2.33/SnapTestAttrition-2.txt) add_fdb_test( - TEST_FILES restarting/from_6.2.29/SnapTestSimpleRestart-1.txt - restarting/from_6.2.29/SnapTestSimpleRestart-2.txt) + TEST_FILES restarting/from_6.2.33/SnapTestSimpleRestart-1.txt + restarting/from_6.2.33/SnapTestSimpleRestart-2.txt) add_fdb_test( - 
TEST_FILES restarting/from_6.2.29/SnapTestRestart-1.txt - restarting/from_6.2.29/SnapTestRestart-2.txt) + TEST_FILES restarting/from_6.2.33/SnapTestRestart-1.txt + restarting/from_6.2.33/SnapTestRestart-2.txt) add_fdb_test( - TEST_FILES restarting/from_6.2.29/SnapCycleRestart-1.txt - restarting/from_6.2.29/SnapCycleRestart-2.txt) + TEST_FILES restarting/from_6.2.33/SnapCycleRestart-1.txt + restarting/from_6.2.33/SnapCycleRestart-2.txt) add_fdb_test( TEST_FILES restarting/from_5.1.7/DrUpgradeRestart-1.txt restarting/from_5.1.7/DrUpgradeRestart-2.txt) diff --git a/tests/restarting/from_6.2.29/SnapCycleRestart-1.txt b/tests/restarting/from_6.2.33/SnapCycleRestart-1.txt similarity index 100% rename from tests/restarting/from_6.2.29/SnapCycleRestart-1.txt rename to tests/restarting/from_6.2.33/SnapCycleRestart-1.txt diff --git a/tests/restarting/from_6.2.29/SnapCycleRestart-2.txt b/tests/restarting/from_6.2.33/SnapCycleRestart-2.txt similarity index 100% rename from tests/restarting/from_6.2.29/SnapCycleRestart-2.txt rename to tests/restarting/from_6.2.33/SnapCycleRestart-2.txt diff --git a/tests/restarting/from_6.2.29/SnapTestAttrition-1.txt b/tests/restarting/from_6.2.33/SnapTestAttrition-1.txt similarity index 100% rename from tests/restarting/from_6.2.29/SnapTestAttrition-1.txt rename to tests/restarting/from_6.2.33/SnapTestAttrition-1.txt diff --git a/tests/restarting/from_6.2.29/SnapTestAttrition-2.txt b/tests/restarting/from_6.2.33/SnapTestAttrition-2.txt similarity index 100% rename from tests/restarting/from_6.2.29/SnapTestAttrition-2.txt rename to tests/restarting/from_6.2.33/SnapTestAttrition-2.txt diff --git a/tests/restarting/from_6.2.29/SnapTestRestart-1.txt b/tests/restarting/from_6.2.33/SnapTestRestart-1.txt similarity index 100% rename from tests/restarting/from_6.2.29/SnapTestRestart-1.txt rename to tests/restarting/from_6.2.33/SnapTestRestart-1.txt diff --git a/tests/restarting/from_6.2.29/SnapTestRestart-2.txt 
b/tests/restarting/from_6.2.33/SnapTestRestart-2.txt similarity index 100% rename from tests/restarting/from_6.2.29/SnapTestRestart-2.txt rename to tests/restarting/from_6.2.33/SnapTestRestart-2.txt diff --git a/tests/restarting/from_6.2.29/SnapTestSimpleRestart-1.txt b/tests/restarting/from_6.2.33/SnapTestSimpleRestart-1.txt similarity index 100% rename from tests/restarting/from_6.2.29/SnapTestSimpleRestart-1.txt rename to tests/restarting/from_6.2.33/SnapTestSimpleRestart-1.txt diff --git a/tests/restarting/from_6.2.29/SnapTestSimpleRestart-2.txt b/tests/restarting/from_6.2.33/SnapTestSimpleRestart-2.txt similarity index 100% rename from tests/restarting/from_6.2.29/SnapTestSimpleRestart-2.txt rename to tests/restarting/from_6.2.33/SnapTestSimpleRestart-2.txt From 69dbe04d42a4dbe42f2b4e453c4d1856ff08e23a Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Fri, 28 May 2021 14:34:20 -0700 Subject: [PATCH 461/461] Rename WeakFutureReference to UnsafeWeakFutureReference and add warning comment --- fdbrpc/AsyncFileCached.actor.cpp | 2 +- fdbrpc/AsyncFileCached.actor.h | 4 ++-- fdbrpc/sim2.actor.cpp | 2 +- fdbrpc/simulator.h | 2 +- flow/genericactors.actor.h | 21 +++++++++++++-------- 5 files changed, 18 insertions(+), 13 deletions(-) diff --git a/fdbrpc/AsyncFileCached.actor.cpp b/fdbrpc/AsyncFileCached.actor.cpp index 984795c105..6354e55cd0 100644 --- a/fdbrpc/AsyncFileCached.actor.cpp +++ b/fdbrpc/AsyncFileCached.actor.cpp @@ -47,7 +47,7 @@ EvictablePage::~EvictablePage() { } // A map of filename to the file handle for all opened cached files -std::map> AsyncFileCached::openFiles; +std::map> AsyncFileCached::openFiles; void AsyncFileCached::remove_page(AFCPage* page) { pages.erase(page->pageOffset); diff --git a/fdbrpc/AsyncFileCached.actor.h b/fdbrpc/AsyncFileCached.actor.h index 2915b0557c..84c42f9716 100644 --- a/fdbrpc/AsyncFileCached.actor.h +++ b/fdbrpc/AsyncFileCached.actor.h @@ -146,7 +146,7 @@ public: if (f.isReady() && f.isError()) return f; if 
(!f.isReady()) - openFiles[filename] = WeakFutureReference(f); + openFiles[filename] = UnsafeWeakFutureReference(f); else return f.get(); } @@ -250,7 +250,7 @@ public: private: // A map of filename to the file handle for all opened cached files - static std::map> openFiles; + static std::map> openFiles; std::string filename; Reference uncached; diff --git a/fdbrpc/sim2.actor.cpp b/fdbrpc/sim2.actor.cpp index f11caa5461..ee735b963a 100644 --- a/fdbrpc/sim2.actor.cpp +++ b/fdbrpc/sim2.actor.cpp @@ -2465,7 +2465,7 @@ Future> Sim2FileSystem::open(const std::string& file diskParameters, (flags & IAsyncFile::OPEN_NO_AIO) == 0); - machineCache[actualFilename] = WeakFutureReference(f); + machineCache[actualFilename] = UnsafeWeakFutureReference(f); } else { f = itr->second.get(); } diff --git a/fdbrpc/simulator.h b/fdbrpc/simulator.h index 19bed013f2..f83686f464 100644 --- a/fdbrpc/simulator.h +++ b/fdbrpc/simulator.h @@ -194,7 +194,7 @@ public: std::vector processes; // A map from filename to file handle for all open files on a machine - std::map> openFiles; + std::map> openFiles; std::set deletingFiles; std::set closingFiles; diff --git a/flow/genericactors.actor.h b/flow/genericactors.actor.h index 88360685cc..400b9cdf41 100644 --- a/flow/genericactors.actor.h +++ b/flow/genericactors.actor.h @@ -1902,11 +1902,14 @@ Future operator>>(Future const& lhs, Future const& rhs) { // A weak reference type to wrap a future Reference object. // Once the future is complete, this object holds a pointer to the referenced object but does // not contribute to its reference count. +// +// WARNING: this class will not be aware when the underlying object is destroyed. It is up to the +// user to make sure that an UnsafeWeakFutureReference is discarded at the same time the object is. 
template -class WeakFutureReference { +class UnsafeWeakFutureReference { public: - WeakFutureReference() {} - WeakFutureReference(Future> future) : data(new WeakFutureReferenceData(future)) {} + UnsafeWeakFutureReference() {} + UnsafeWeakFutureReference(Future> future) : data(new UnsafeWeakFutureReferenceData(future)) {} // Returns a future to obtain a normal reference handle // If the future is ready, this creates a Reference to wrap the object @@ -1926,17 +1929,19 @@ public: Optional getPtrIfReady() { return data->ptr; } private: - // A class to hold the state for a WeakFutureReference - struct WeakFutureReferenceData : public ReferenceCounted, NonCopyable { + // A class to hold the state for an UnsafeWeakFutureReference + struct UnsafeWeakFutureReferenceData : public ReferenceCounted, NonCopyable { Optional ptr; Future> future; Future moveResultFuture; - WeakFutureReferenceData(Future> future) : future(future) { moveResultFuture = moveResult(this); } + UnsafeWeakFutureReferenceData(Future> future) : future(future) { + moveResultFuture = moveResult(this); + } // Waits for the future to complete and then stores the pointer in local storage // When this completes, we will no longer be counted toward the reference count of the object - ACTOR Future moveResult(WeakFutureReferenceData* self) { + ACTOR Future moveResult(UnsafeWeakFutureReferenceData* self) { Reference result = wait(self->future); self->ptr = result.getPtr(); self->future = Future>(); @@ -1944,7 +1949,7 @@ private: } }; - Reference data; + Reference data; }; #include "flow/unactorcompiler.h"