From 76838a20b7bd936472d3431bbc7534afac883dad Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Fri, 30 Oct 2020 09:11:08 -0700 Subject: [PATCH 001/317] A model used to quickly simulate various GRV scenarios and algorithms --- contrib/grv_proxy_model/grv_test.py | 134 ++++++++ contrib/grv_proxy_model/plot.py | 107 +++++++ contrib/grv_proxy_model/priority.py | 40 +++ contrib/grv_proxy_model/proxy_model.py | 338 ++++++++++++++++++++ contrib/grv_proxy_model/rate_model.py | 83 +++++ contrib/grv_proxy_model/ratekeeper_model.py | 67 ++++ contrib/grv_proxy_model/smoother.py | 53 +++ contrib/grv_proxy_model/workload_model.py | 201 ++++++++++++ 8 files changed, 1023 insertions(+) create mode 100755 contrib/grv_proxy_model/grv_test.py create mode 100755 contrib/grv_proxy_model/plot.py create mode 100755 contrib/grv_proxy_model/priority.py create mode 100755 contrib/grv_proxy_model/proxy_model.py create mode 100755 contrib/grv_proxy_model/rate_model.py create mode 100755 contrib/grv_proxy_model/ratekeeper_model.py create mode 100644 contrib/grv_proxy_model/smoother.py create mode 100755 contrib/grv_proxy_model/workload_model.py diff --git a/contrib/grv_proxy_model/grv_test.py b/contrib/grv_proxy_model/grv_test.py new file mode 100755 index 0000000000..1cd0224538 --- /dev/null +++ b/contrib/grv_proxy_model/grv_test.py @@ -0,0 +1,134 @@ +#!/usr/bin/env python3 + +# +# grv_test.py +# +# This source file is part of the FoundationDB open source project +# +# Copyright 2013-2020 Apple Inc. and the FoundationDB project authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +import argparse +import inspect +import sys + +import rate_model +import workload_model +import proxy_model +import ratekeeper_model +from priority import Priority +from plot import Plotter + +parser = argparse.ArgumentParser() +parser.add_argument('-w', '--workload', type=str, help='Name of workload to run') +parser.add_argument('-r', '--ratekeeper', type=str, help='Name of ratekeeper model') +parser.add_argument('-d', '--duration', type=int, default=240, help='Duration of simulated test, in seconds. Defaults to 240.') +parser.add_argument('-L', '--limiter', type=str, default='Original', help='Name of limiter implementation. Defaults to \'Original\'.') +parser.add_argument('-p', '--proxy', type=str, default='ProxyModel', help='Name of proxy implementation. Defaults to \'ProxyModel\'.') +parser.add_argument('--list', action='store_true', default=False, help='List options for all models.') +parser.add_argument('--no-graph', action='store_true', default=False, help='Disable graphical output.') + +args = parser.parse_args() + +def print_choices_list(context=None): + if context == 'workload' or context is None: + print('Workloads:') + for w in workload_model.predefined_workloads.keys(): + print(' %s' % w) + + if context == 'ratekeeper' or context is None: + print('\nRatekeeper models:') + for r in ratekeeper_model.predefined_ratekeeper.keys(): + print(' %s' % r) + + proxy_model_classes = [c for c in [getattr(proxy_model, a) for a in dir(proxy_model)] if inspect.isclass(c)] + + if context == 'proxy' or context is None: + print('\nProxy models:') + for p in proxy_model_classes: + if issubclass(p, proxy_model.ProxyModel): + print(' %s' % p.__name__) + + if context == 'limiter' or context is None: + print('\nProxy limiters:') + for p in proxy_model_classes: + if issubclass(p, proxy_model.Limiter) and p != proxy_model.Limiter: + name = p.__name__ + if 
name.endswith('Limiter'):
+            name = name[0:-len('Limiter')]
+            print('  %s' % name)
+
+# Handle --list before validating required arguments, so that
+# `grv_test.py --list` works without also passing -w/-r.
+if args.list:
+    print_choices_list()
+    sys.exit(0)
+
+if args.workload is None or args.ratekeeper is None:
+    print('ERROR: A workload (-w/--workload) and ratekeeper model (-r/--ratekeeper) must be specified.\n')
+    print_choices_list()
+    sys.exit(1)
+
+def validate_class_type(var, name, superclass):
+    cls = getattr(var, name, None)
+    return cls is not None and inspect.isclass(cls) and issubclass(cls, superclass)
+
+if not args.ratekeeper in ratekeeper_model.predefined_ratekeeper:
+    print('Invalid ratekeeper model `%s\'' % args.ratekeeper)
+    print_choices_list('ratekeeper')
+    sys.exit(1)
+
+if not args.workload in workload_model.predefined_workloads:
+    print('Invalid workload model `%s\'' % args.workload)
+    print_choices_list('workload')
+    sys.exit(1)
+
+if not validate_class_type(proxy_model, args.proxy, proxy_model.ProxyModel):
+    print('Invalid proxy model `%s\'' % args.proxy)
+    print_choices_list('proxy')
+    sys.exit(1)
+
+limiter_name = args.limiter
+if not validate_class_type(proxy_model, limiter_name, proxy_model.Limiter):
+    limiter_name += 'Limiter'
+    if not validate_class_type(proxy_model, limiter_name, proxy_model.Limiter):
+        print('Invalid proxy limiter `%s\'' % args.limiter)
+        print_choices_list('limiter')
+        sys.exit(1)
+
+ratekeeper = ratekeeper_model.predefined_ratekeeper[args.ratekeeper]
+workload = workload_model.predefined_workloads[args.workload]
+
+limiter = getattr(proxy_model, limiter_name)
+proxy = getattr(proxy_model, args.proxy)(args.duration, ratekeeper, workload, limiter)
+
+proxy.run()
+
+for priority in workload.priorities():
+    latencies = sorted([p for t in proxy.results.latencies[priority].values() for p in t])
+    total_started = sum(proxy.results.started[priority].values())
+    still_queued = sum([r.count for r in proxy.request_queue if r.priority == priority])
+
+    if len(latencies) > 0:
+        print('\n%s: %d requests in %d seconds (rate=%f). %d still queued.' 
% (priority, total_started, proxy.time, float(total_started)/proxy.time, still_queued)) + print(' Median latency: %f' % latencies[len(latencies)//2]) + print(' 90%% latency: %f' % latencies[int(0.9*len(latencies))]) + print(' 99%% latency: %f' % latencies[int(0.99*len(latencies))]) + print(' 99.9%% latency: %f' % latencies[int(0.999*len(latencies))]) + print(' Max latency: %f' % latencies[-1]) + +print('') + +if not args.no_graph: + plotter = Plotter(proxy.results) + plotter.display() diff --git a/contrib/grv_proxy_model/plot.py b/contrib/grv_proxy_model/plot.py new file mode 100755 index 0000000000..9334e2c844 --- /dev/null +++ b/contrib/grv_proxy_model/plot.py @@ -0,0 +1,107 @@ +# +# plot.py +# +# This source file is part of the FoundationDB open source project +# +# Copyright 2013-2020 Apple Inc. and the FoundationDB project authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import matplotlib.pyplot as plt + +class Plotter: + def __init__(self, results): + self.results = results + + def add_plot(data, time_resolution, label, use_avg=False): + out_data = {} + counts = {} + for t in data.keys(): + out_data.setdefault(t//time_resolution*time_resolution, 0) + counts.setdefault(t//time_resolution*time_resolution, 0) + out_data[t//time_resolution*time_resolution] += data[t] + counts[t//time_resolution*time_resolution] += 1 + + if use_avg: + out_data = { t: v/counts[t] for t,v in out_data.items() } + + plt.plot(list(out_data.keys()), list(out_data.values()), label=label) + + def add_plot_with_times(data, label): + plt.plot(list(data.keys()), list(data.values()), label=label) + + def display(self, time_resolution=0.1): + plt.figure(figsize=(40,9)) + plt.subplot(3, 3, 1) + for priority in self.results.started.keys(): + Plotter.add_plot(self.results.started[priority], time_resolution, priority) + + plt.xlabel('Time (s)') + plt.ylabel('Released/s') + plt.legend() + + plt.subplot(3, 3, 2) + for priority in self.results.queued.keys(): + Plotter.add_plot(self.results.queued[priority], time_resolution, priority) + + plt.xlabel('Time (s)') + plt.ylabel('Requests/s') + plt.legend() + + plt.subplot(3, 3, 3) + for priority in self.results.unprocessed_queue_sizes.keys(): + data = {k: max(v) for (k,v) in self.results.unprocessed_queue_sizes[priority].items()} + Plotter.add_plot(data, time_resolution, priority) + + plt.xlabel('Time (s)') + plt.ylabel('Max queue size') + plt.legend() + + num = 4 + for priority in self.results.latencies.keys(): + plt.subplot(3, 3, num) + median_latencies = {k: v[int(0.5*len(v))] if len(v) > 0 else 0 for (k,v) in self.results.latencies[priority].items()} + percentile90_latencies = {k: v[int(0.9*len(v))] if len(v) > 0 else 0 for (k,v) in self.results.latencies[priority].items()} + max_latencies = {k: max(v) if len(v) > 0 else 0 for (k,v) in self.results.latencies[priority].items()} + + Plotter.add_plot(median_latencies, 
time_resolution, 'median') + Plotter.add_plot(percentile90_latencies, time_resolution, '90th percentile') + Plotter.add_plot(max_latencies, time_resolution, 'max') + + plt.xlabel('Time (s)') + plt.ylabel(str(priority) + ' Latency (s)') + plt.yscale('log') + plt.legend() + num += 1 + + for priority in self.results.rate.keys(): + plt.subplot(3, 3, num) + if len(self.results.rate[priority]) > 0: + Plotter.add_plot(self.results.rate[priority], time_resolution, 'Rate', use_avg=True) + if len(self.results.released[priority]) > 0: + Plotter.add_plot(self.results.released[priority], time_resolution, 'Released', use_avg=True) + if len(self.results.limit[priority]) > 0: + Plotter.add_plot(self.results.limit[priority], time_resolution, 'Limit', use_avg=True) + if len(self.results.limit_and_budget[priority]) > 0: + Plotter.add_plot(self.results.limit_and_budget[priority], time_resolution, 'Limit and budget', use_avg=True) + if len(self.results.budget[priority]) > 0: + Plotter.add_plot(self.results.budget[priority], time_resolution, 'Budget', use_avg=True) + + plt.xlabel('Time (s)') + plt.ylabel('Value (' + str(priority) + ')') + plt.legend() + num += 1 + + plt.show() + diff --git a/contrib/grv_proxy_model/priority.py b/contrib/grv_proxy_model/priority.py new file mode 100755 index 0000000000..3ba5c05f2e --- /dev/null +++ b/contrib/grv_proxy_model/priority.py @@ -0,0 +1,40 @@ +# +# priority.py +# +# This source file is part of the FoundationDB open source project +# +# Copyright 2013-2020 Apple Inc. and the FoundationDB project authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +import functools + +@functools.total_ordering +class Priority: + def __init__(self, priority_value, label): + self.priority_value = priority_value + self.label = label + + def __lt__(self, other): + return self.priority_value < other.priority_value + + def __str__(self): + return self.label + + def __repr__(self): + return repr(self.label) + +Priority.SYSTEM = Priority(0, "System") +Priority.DEFAULT = Priority(1, "Default") +Priority.BATCH = Priority(2, "Batch") diff --git a/contrib/grv_proxy_model/proxy_model.py b/contrib/grv_proxy_model/proxy_model.py new file mode 100755 index 0000000000..9ca2a39bfe --- /dev/null +++ b/contrib/grv_proxy_model/proxy_model.py @@ -0,0 +1,338 @@ +# +# proxy_model.py +# +# This source file is part of the FoundationDB open source project +# +# Copyright 2013-2020 Apple Inc. and the FoundationDB project authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import copy +import functools +import heapq + +from priority import Priority +from smoother import Smoother + +@functools.total_ordering +class Task: + def __init__(self, time, fxn): + self.time = time + self.fxn = fxn + + def __lt__(self, other): + return self.time < other.time + +class Limiter: + class UpdateRateParams: + def __init__(self, time): + self.time = time + + class UpdateLimitParams: + def __init__(self, time, elapsed): + self.time = time + self.elapsed = elapsed + + class CanStartParams: + def __init__(self, time, num_started, count): + self.time = time + self.num_started = num_started + self.count = count + + class UpdateBudgetParams: + def __init__(self, time, num_started, num_started_at_priority, min_priority, last_batch, queue_empty, elapsed): + self.time = time + self.num_started = num_started + self.num_started_at_priority = num_started_at_priority + self.min_priority = min_priority + self.last_batch = last_batch + self.queue_empty = queue_empty + self.elapsed = elapsed + + def __init__(self, priority, ratekeeper_model, proxy_model): + self.priority = priority + self.ratekeeper_model = ratekeeper_model + self.proxy_model = proxy_model + self.limit = 0 + self.rate = self.ratekeeper_model.get_limit(0, self.priority) + + def update_rate(self, params): + pass + + def update_limit(self, params): + pass + + def can_start(self, params): + pass + + def update_budget(self, params): + pass + +class OriginalLimiter(Limiter): + def __init__(self, priority, limit_rate_model, proxy_model): + Limiter.__init__(self, priority, limit_rate_model, proxy_model) + + def update_rate(self, params): + self.rate = self.ratekeeper_model.get_limit(params.time, self.priority) + + def update_limit(self, params): + self.limit = min(0, self.limit) + params.elapsed * self.rate + self.limit = min(self.limit, self.rate * 0.01) + self.limit = min(self.limit, 100000) + + self.proxy_model.results.rate[self.priority][params.time] = self.rate + 
self.proxy_model.results.limit[self.priority][params.time] = self.limit + + def can_start(self, params): + return params.num_started < self.limit + + def update_budget(self, params): + self.limit -= params.num_started + +class PositiveBudgetLimiter(OriginalLimiter): + def __init__(self, priority, limit_rate_model, proxy_model): + OriginalLimiter.__init__(self, priority, limit_rate_model, proxy_model) + + def update_limit(self, params): + self.limit += params.elapsed * self.rate + self.limit = min(self.limit, 2.0 * self.rate) + +class ClampedBudgetLimiter(PositiveBudgetLimiter): + def __init__(self, priority, limit_rate_model, proxy_model): + PositiveBudgetLimiter.__init__(self, priority, limit_rate_model, proxy_model) + + def update_budget(self, params): + min_budget = -self.rate * 5.0 + if self.limit > min_budget: + self.limit = max(self.limit - params.num_started, min_budget) + +class TimeLimiter(PositiveBudgetLimiter): + def __init__(self, priority, limit_rate_model, proxy_model): + PositiveBudgetLimiter.__init__(self, priority, limit_rate_model, proxy_model) + self.locked_until = 0 + + def can_start(self, params): + return params.time >= self.locked_until and PositiveBudgetLimiter.can_start(self, params) + + def update_budget(self, params): + #print('Start update budget: time=%f, limit=%f, locked_until=%f, num_started=%d, priority=%s, min_priority=%s, last_batch=%d' % (params.time, self.limit, self.locked_until, params.num_started, self.priority, params.min_priority, params.last_batch)) + + if params.min_priority >= self.priority or params.num_started < self.limit: + self.limit -= params.num_started + else: + self.limit = min(self.limit, max(self.limit - params.num_started, -params.last_batch)) + self.locked_until = min(params.time + 2.0, max(params.time, self.locked_until) + (params.num_started - self.limit)/self.rate) + + #print('End update budget: time=%f, limit=%f, locked_until=%f, num_started=%d, priority=%s, min_priority=%s' % (params.time, self.limit, 
self.locked_until, params.num_started, self.priority, params.min_priority))
+
+class TimePositiveBudgetLimiter(PositiveBudgetLimiter):
+    def __init__(self, priority, limit_rate_model, proxy_model):
+        PositiveBudgetLimiter.__init__(self, priority, limit_rate_model, proxy_model)
+        self.locked_until = 0
+
+    def update_limit(self, params):
+        # Budget only accumulates while the limiter is not locked out.
+        if params.time >= self.locked_until:
+            PositiveBudgetLimiter.update_limit(self, params)
+
+    def can_start(self, params):
+        return params.num_started + params.count <= self.limit
+
+    def update_budget(self, params):
+        #if params.num_started > 0:
+            #print('Start update budget: time=%f, limit=%f, locked_until=%f, num_started=%d, priority=%s, min_priority=%s, last_batch=%d' % (params.time, self.limit, self.locked_until, params.num_started, self.priority, params.min_priority, params.last_batch))
+
+        if params.num_started > self.limit:
+            # Fix: 'penalty' was an undefined name here (NameError on the
+            # first budget overrun). Mirroring TimeLimiter.update_budget, the
+            # penalty is the overshoot beyond the budget; lock the limiter
+            # out long enough to pay it back at the current rate, capped at
+            # 2 seconds past the current time.
+            overshoot = params.num_started - self.limit
+            self.locked_until = min(params.time + 2.0, max(params.time, self.locked_until) + overshoot/self.rate)
+            self.limit = 0
+        else:
+            self.limit -= params.num_started
+
+        #if params.num_started > 0:
+            #print('End update budget: time=%f, limit=%f, locked_until=%f, num_started=%d, priority=%s, min_priority=%s' % (params.time, self.limit, self.locked_until, params.num_started, self.priority, params.min_priority))
+
+class SmoothingLimiter(OriginalLimiter):
+    def __init__(self, priority, limit_rate_model, proxy_model):
+        OriginalLimiter.__init__(self, priority, limit_rate_model, proxy_model)
+        self.smooth_released = Smoother(2)
+        self.smooth_rate_limit = Smoother(2)
+        self.rate_set = False
+
+    def update_rate(self, params):
+        OriginalLimiter.update_rate(self, params)
+        if not self.rate_set:
+            self.rate_set = True
+            self.smooth_rate_limit.reset(self.rate)
+        else:
+            self.smooth_rate_limit.set_total(params.time, self.rate)
+
+    def update_limit(self, params):
+        self.limit = 2.0 * (self.smooth_rate_limit.smooth_total(params.time) - self.smooth_released.smooth_rate(params.time))
+
+    def can_start(self, params):
+        return 
params.num_started + params.count <= self.limit + + def update_budget(self, params): + self.smooth_released.add_delta(params.time, params.num_started) + +class SmoothingBudgetLimiter(SmoothingLimiter): + def __init__(self, priority, limit_rate_model, proxy_model): + SmoothingLimiter.__init__(self, priority, limit_rate_model, proxy_model) + #self.smooth_filled = Smoother(2) + self.budget = 0 + + def update_limit(self, params): + release_rate = (self.smooth_rate_limit.smooth_total(params.time) - self.smooth_released.smooth_rate(params.time)) + #self.smooth_filled.set_total(params.time, 1 if release_rate > 0 else 0) + self.limit = 2.0 * release_rate + + self.proxy_model.results.rate[self.priority][params.time] = self.smooth_rate_limit.smooth_total(params.time) + self.proxy_model.results.released[self.priority][params.time] = self.smooth_released.smooth_rate(params.time) + self.proxy_model.results.limit[self.priority][params.time] = self.limit + self.proxy_model.results.limit_and_budget[self.priority][params.time] = self.limit + self.budget + self.proxy_model.results.budget[self.priority][params.time] = self.budget + + #self.budget = max(0, self.budget + params.elapsed * self.smooth_rate_limit.smooth_total(params.time)) + + #if self.smooth_filled.smooth_total(params.time) >= 0.1: + #self.budget += params.elapsed * self.smooth_rate_limit.smooth_total(params.time) + + #print('Update limit: time=%f, priority=%s, limit=%f, rate=%f, released=%f, budget=%f' % (params.time, self.priority, self.limit, self.smooth_rate_limit.smooth_total(params.time), self.smooth_released.smooth_rate(params.time), self.budget)) + + def can_start(self, params): + return params.num_started + params.count <= self.limit + self.budget #or params.num_started + params.count <= self.budget + + def update_budget(self, params): + self.budget = max(0, self.budget + (self.limit - params.num_started_at_priority) / 2 * params.elapsed) + + if params.queue_empty: + self.budget = min(10, self.budget) + + 
self.smooth_released.add_delta(params.time, params.num_started_at_priority) + +class ProxyModel: + class Results: + def __init__(self, priorities, duration): + self.started = self.init_result(priorities, 0, duration) + self.queued = self.init_result(priorities, 0, duration) + self.latencies = self.init_result(priorities, [], duration) + self.unprocessed_queue_sizes = self.init_result(priorities, [], duration) + + self.rate = {p:{} for p in priorities} + self.released = {p:{} for p in priorities} + self.limit = {p:{} for p in priorities} + self.limit_and_budget = {p:{} for p in priorities} + self.budget = {p:{} for p in priorities} + + def init_result(self, priorities, starting_value, duration): + return {p: {s: copy.copy(starting_value) for s in range(0, duration)} for p in priorities} + + def __init__(self, duration, ratekeeper_model, workload_model, Limiter): + self.time = 0 + self.log_time = 0 + self.duration = duration + self.priority_limiters = { priority: Limiter(priority, ratekeeper_model, self) for priority in workload_model.priorities() } + self.workload_model = workload_model + self.request_scheduled = { p: False for p in self.workload_model.priorities()} + + self.tasks = [] + self.request_queue = [] + self.results = ProxyModel.Results(self.workload_model.priorities(), duration) + + def run(self): + self.update_rate() + self.process_requests(self.time) + + for priority in self.workload_model.priorities(): + next_request = self.workload_model.next_request(self.time, priority) + assert next_request is not None + heapq.heappush(self.tasks, Task(next_request.time, lambda next_request=next_request: self.receive_request(next_request))) + self.request_scheduled[priority] = True + + while True:# or len(self.request_queue) > 0: + if int(self.time) > self.log_time: + self.log_time = int(self.time) + #print(self.log_time) + + task = heapq.heappop(self.tasks) + self.time = task.time + if self.time >= self.duration: + break + + task.fxn() + + def update_rate(self): + 
for limiter in self.priority_limiters.values(): + limiter.update_rate(Limiter.UpdateRateParams(self.time)) + + heapq.heappush(self.tasks, Task(self.time + 0.01, lambda: self.update_rate())) + + def receive_request(self, request): + heapq.heappush(self.request_queue, request) + + self.results.queued[request.priority][int(self.time)] += request.count + + next_request = self.workload_model.next_request(self.time, request.priority) + if next_request is not None and next_request.time < self.duration: + heapq.heappush(self.tasks, Task(next_request.time, lambda: self.receive_request(next_request))) + else: + self.request_scheduled[request.priority] = False + + def process_requests(self, last_time): + elapsed = self.time - last_time + for limiter in self.priority_limiters.values(): + limiter.update_limit(Limiter.UpdateLimitParams(self.time, elapsed)) + + current_started = 0 + started = {p:0 for p in self.workload_model.priorities()} + + min_priority = Priority.SYSTEM + last_batch = 0 + while len(self.request_queue) > 0: + request = self.request_queue[0] + + if not self.priority_limiters[request.priority].can_start(Limiter.CanStartParams(self.time, current_started, request.count)): + break + + min_priority = request.priority + last_batch = request.count + + if self.workload_model.request_completed(request) and not self.request_scheduled[request.priority]: + next_request = self.workload_model.next_request(self.time, request.priority) + assert next_request is not None + heapq.heappush(self.tasks, Task(next_request.time, lambda next_request=next_request: self.receive_request(next_request))) + self.request_scheduled[request.priority] = True + + current_started += request.count + started[request.priority] += request.count + + heapq.heappop(self.request_queue) + self.results.started[request.priority][int(self.time)] += request.count + self.results.latencies[request.priority][int(self.time)].append(self.time-request.time) + + if len(self.request_queue) == 0: + min_priority = 
Priority.BATCH + + for priority, limiter in self.priority_limiters.items(): + started_at_priority = sum([v for p,v in started.items() if p <= priority]) + limiter.update_budget(Limiter.UpdateBudgetParams(self.time, current_started, started_at_priority, min_priority, last_batch, len(self.request_queue) == 0 or self.request_queue[0].priority > priority, elapsed)) + + for priority in self.workload_model.priorities(): + self.results.unprocessed_queue_sizes[priority][int(self.time)].append(self.workload_model.workload_models[priority].outstanding) + + current_time = self.time + + delay = 0.001 + heapq.heappush(self.tasks, Task(self.time + delay, lambda: self.process_requests(current_time))) + + diff --git a/contrib/grv_proxy_model/rate_model.py b/contrib/grv_proxy_model/rate_model.py new file mode 100755 index 0000000000..1fabce2c7e --- /dev/null +++ b/contrib/grv_proxy_model/rate_model.py @@ -0,0 +1,83 @@ +# +# rate_model.py +# +# This source file is part of the FoundationDB open source project +# +# Copyright 2013-2020 Apple Inc. and the FoundationDB project authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import numpy + +class RateModel: + def __init__(self): + pass + + def get_rate(self, time): + pass + +class FixedRateModel(RateModel): + def __init__(self, rate): + RateModel.__init__(self) + self.rate = rate + + def get_rate(self, time): + return self.rate + +class UnlimitedRateModel(FixedRateModel): + def __init__(self): + self.rate = 1e9 + +class IntervalRateModel(RateModel): + def __init__(self, intervals): + self.intervals = sorted(intervals) + + def get_rate(self, time): + if len(self.intervals) == 0 or time < self.intervals[0][0]: + return 0 + + target_interval = len(self.intervals)-1 + for i in range(1, len(self.intervals)): + if time < self.intervals[i][0]: + target_interval = i-1 + break + + self.intervals = self.intervals[target_interval:] + return self.intervals[0][1] + +class SawtoothRateModel(RateModel): + def __init__(self, low, high, frequency): + self.low = low + self.high = high + self.frequency = frequency + + def get_rate(self, time): + if int(2*time/self.frequency) % 2 == 0: + return self.low + else: + return self.high + +class DistributionRateModel(RateModel): + def __init__(self, distribution, frequency): + self.distribution = distribution + self.frequency = frequency + self.last_change = 0 + self.rate = None + + def get_rate(self, time): + if self.frequency == 0 or int((time - self.last_change) / self.frequency) > int(self.last_change / self.frequency) or self.rate is None: + self.last_change = time + self.rate = self.distribution() + + return self.rate diff --git a/contrib/grv_proxy_model/ratekeeper_model.py b/contrib/grv_proxy_model/ratekeeper_model.py new file mode 100755 index 0000000000..57125dc4c0 --- /dev/null +++ b/contrib/grv_proxy_model/ratekeeper_model.py @@ -0,0 +1,67 @@ +# +# ratekeeper.py +# +# This source file is part of the FoundationDB open source project +# +# Copyright 2013-2020 Apple Inc. 
and the FoundationDB project authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import numpy +import rate_model +from priority import Priority + +class RatekeeperModel: + def __init__(self, limit_models): + self.limit_models = limit_models + + def get_limit(self, time, priority): + return self.limit_models[priority].get_rate(time) + +predefined_ratekeeper = {} + +predefined_ratekeeper['default200_batch100'] = RatekeeperModel( +{ + Priority.SYSTEM: rate_model.UnlimitedRateModel(), + Priority.DEFAULT: rate_model.FixedRateModel(200), + Priority.BATCH: rate_model.FixedRateModel(100) +}) + +predefined_ratekeeper['default_sawtooth'] = RatekeeperModel( +{ + Priority.SYSTEM: rate_model.UnlimitedRateModel(), + Priority.DEFAULT: rate_model.SawtoothRateModel(10, 200, 1), + Priority.BATCH: rate_model.FixedRateModel(0) +}) + +predefined_ratekeeper['default_uniform_random'] = RatekeeperModel( +{ + Priority.SYSTEM: rate_model.UnlimitedRateModel(), + Priority.DEFAULT: rate_model.DistributionRateModel(lambda: numpy.random.uniform(10, 200), 1), + Priority.BATCH: rate_model.FixedRateModel(0) +}) + +predefined_ratekeeper['default_trickle'] = RatekeeperModel( +{ + Priority.SYSTEM: rate_model.UnlimitedRateModel(), + Priority.DEFAULT: rate_model.FixedRateModel(3), + Priority.BATCH: rate_model.FixedRateModel(0) +}) + +predefined_ratekeeper['default1000'] = RatekeeperModel( +{ + Priority.SYSTEM: rate_model.UnlimitedRateModel(), + Priority.DEFAULT: rate_model.FixedRateModel(1000), + 
Priority.BATCH: rate_model.FixedRateModel(500) +}) diff --git a/contrib/grv_proxy_model/smoother.py b/contrib/grv_proxy_model/smoother.py new file mode 100644 index 0000000000..bc1b32ea12 --- /dev/null +++ b/contrib/grv_proxy_model/smoother.py @@ -0,0 +1,53 @@ +# +# smoother.py +# +# This source file is part of the FoundationDB open source project +# +# Copyright 2013-2020 Apple Inc. and the FoundationDB project authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import math + +class Smoother: + def __init__(self, folding_time): + self.folding_time = folding_time + self.reset(0) + + def reset(self, value): + self.time = 0 + self.total = value + self.estimate = value + + def set_total(self, time, total): + self.add_delta(time, total-self.total) + + def add_delta(self, time, delta): + self.update(time) + self.total += delta + + def smooth_total(self, time): + self.update(time) + return self.estimate + + def smooth_rate(self, time): + self.update(time) + return (self.total-self.estimate) / self.folding_time + + def update(self, time): + elapsed = time - self.time + if elapsed > 0: + self.time = time + self.estimate += (self.total-self.estimate) * (1-math.exp(-elapsed/self.folding_time)) + diff --git a/contrib/grv_proxy_model/workload_model.py b/contrib/grv_proxy_model/workload_model.py new file mode 100755 index 0000000000..63fb4c472e --- /dev/null +++ b/contrib/grv_proxy_model/workload_model.py @@ -0,0 +1,201 @@ +# +# workload_model.py +# +# This source file is 
part of the FoundationDB open source project +# +# Copyright 2013-2020 Apple Inc. and the FoundationDB project authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import functools +import numpy +import math + +import rate_model +from priority import Priority + +@functools.total_ordering +class Request: + def __init__(self, time, count, priority): + self.time = time + self.count = count + self.priority = priority + + def __lt__(self, other): + return self.priority < other.priority + +class PriorityWorkloadModel: + def __init__(self, priority, rate_model, batch_model, generator, max_outstanding=1e9): + self.priority = priority + self.rate_model = rate_model + self.batch_model = batch_model + self.generator = generator + self.max_outstanding = max_outstanding + self.outstanding = 0 + + def next_request(self, time): + if self.outstanding >= self.max_outstanding: + return None + + batch_size = self.batch_model.next_batch() + self.outstanding += batch_size + interval = self.generator.next_request_interval(self.rate_model.get_rate(time)) + return Request(time + interval, batch_size, self.priority) + + def request_completed(self, request): + was_full = self.max_outstanding <= self.outstanding + self.outstanding -= request.count + + return was_full and self.outstanding < self.max_outstanding + +class WorkloadModel: + def __init__(self, workload_models): + self.workload_models = workload_models + + def priorities(self): + return list(self.workload_models.keys()) + + 
def next_request(self, time, priority): + return self.workload_models[priority].next_request(time) + + def request_completed(self, request): + return self.workload_models[request.priority].request_completed(request) + +class Distribution: + EXPONENTIAL = lambda x: numpy.random.exponential(x) + UNIFORM = lambda x: numpy.random.uniform(0, 2.0*x) + FIXED = lambda x: x + +class BatchGenerator: + def __init__(self): + pass + + def next_batch(self): + pass + +class DistributionBatchGenerator(BatchGenerator): + def __init__(self, distribution, size): + BatchGenerator.__init__(self) + self.distribution = distribution + self.size = size + + def next_batch(self): + return math.ceil(self.distribution(self.size)) + +class RequestGenerator: + def __init__(self): + pass + + def next_request_interval(self, rate): + pass + +class DistributionRequestGenerator(RequestGenerator): + def __init__(self, distribution): + RequestGenerator.__init__(self) + self.distribution = distribution + + def next_request_interval(self, rate): + if rate == 0: + return 1e9 + + return self.distribution(1.0/rate) + +predefined_workloads = {} + +predefined_workloads['slow_exponential'] = WorkloadModel( +{ + Priority.DEFAULT: PriorityWorkloadModel(Priority.DEFAULT, + rate_model.FixedRateModel(100), + DistributionBatchGenerator(Distribution.FIXED, 1), + DistributionRequestGenerator(Distribution.EXPONENTIAL), + max_outstanding=100 + ) +}) + +predefined_workloads['fixed_uniform'] = WorkloadModel( +{ + Priority.SYSTEM: PriorityWorkloadModel(Priority.SYSTEM, + rate_model.FixedRateModel(0), + DistributionBatchGenerator(Distribution.FIXED, 1), + DistributionRequestGenerator(Distribution.UNIFORM), + max_outstanding=10 + ), + Priority.DEFAULT: PriorityWorkloadModel(Priority.DEFAULT, + rate_model.FixedRateModel(95), + DistributionBatchGenerator(Distribution.FIXED, 10), + DistributionRequestGenerator(Distribution.UNIFORM), + max_outstanding=200 + ), + Priority.BATCH: PriorityWorkloadModel(Priority.BATCH, + 
rate_model.FixedRateModel(1), + DistributionBatchGenerator(Distribution.UNIFORM, 500), + DistributionRequestGenerator(Distribution.UNIFORM), + max_outstanding=200 + ) +}) + +predefined_workloads['batch_starvation'] = WorkloadModel( +{ + Priority.SYSTEM: PriorityWorkloadModel(Priority.SYSTEM, + rate_model.FixedRateModel(1), + DistributionBatchGenerator(Distribution.FIXED, 1), + DistributionRequestGenerator(Distribution.UNIFORM), + max_outstanding=10 + ), + Priority.DEFAULT: PriorityWorkloadModel(Priority.DEFAULT, + rate_model.IntervalRateModel([(0,50), (60,150), (120,90)]), + DistributionBatchGenerator(Distribution.FIXED, 1), + DistributionRequestGenerator(Distribution.UNIFORM), + max_outstanding=200 + ), + Priority.BATCH: PriorityWorkloadModel(Priority.BATCH, + rate_model.FixedRateModel(100), + DistributionBatchGenerator(Distribution.FIXED, 1), + DistributionRequestGenerator(Distribution.UNIFORM), + max_outstanding=200 + ) +}) + +predefined_workloads['default_low_high_low'] = WorkloadModel( +{ + Priority.SYSTEM: PriorityWorkloadModel(Priority.SYSTEM, + rate_model.FixedRateModel(0), + DistributionBatchGenerator(Distribution.FIXED, 1), + DistributionRequestGenerator(Distribution.UNIFORM), + max_outstanding=10 + ), + Priority.DEFAULT: PriorityWorkloadModel(Priority.DEFAULT, + rate_model.IntervalRateModel([(0,100), (60,300), (120,100)]), + DistributionBatchGenerator(Distribution.FIXED, 1), + DistributionRequestGenerator(Distribution.UNIFORM), + max_outstanding=200 + ), + Priority.BATCH: PriorityWorkloadModel(Priority.BATCH, + rate_model.FixedRateModel(0), + DistributionBatchGenerator(Distribution.FIXED, 1), + DistributionRequestGenerator(Distribution.UNIFORM), + max_outstanding=200 + ) +}) + +for rate in [83, 100, 180, 190, 200]: + predefined_workloads['default%d' % rate] = WorkloadModel( + { + Priority.DEFAULT: PriorityWorkloadModel(Priority.DEFAULT, + rate_model.FixedRateModel(rate), + DistributionBatchGenerator(Distribution.FIXED, 1), + 
DistributionRequestGenerator(Distribution.EXPONENTIAL), + max_outstanding=1000 + ) + }) From 82f7f541c39377ae2386cc52b777b354b3f545c4 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Wed, 25 Nov 2020 11:38:08 -0700 Subject: [PATCH 002/317] started lineage implementation --- flow/flow.cpp | 2 ++ flow/flow.h | 33 +++++++++++++++++++++++++++++++-- 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/flow/flow.cpp b/flow/flow.cpp index 89f04bd5df..a2bfcc1510 100644 --- a/flow/flow.cpp +++ b/flow/flow.cpp @@ -26,6 +26,8 @@ #include #include +thread_local ActorLineagePropertyMap* currentLineage = nullptr; + #if (defined(__linux__) || defined(__FreeBSD__)) && defined(__AVX__) && !defined(MEMORY_SANITIZER) // For benchmarking; need a version of rte_memcpy that doesn't live in the same compilation unit as the test. void * rte_memcpy_noinline(void *__restrict __dest, const void *__restrict __src, size_t __n) { diff --git a/flow/flow.h b/flow/flow.h index a72465143d..155c5db2a2 100644 --- a/flow/flow.h +++ b/flow/flow.h @@ -36,6 +36,7 @@ #include #include #include +#include #include "flow/Platform.h" #include "flow/FastAlloc.h" @@ -407,6 +408,30 @@ struct SingleCallback { } }; +// in the future we might want to read these from a different thread. std::shared_ptr +// seems to be better suited for this... +struct ActorLineagePropertyMap : std::enable_shared_from_this { + std::shared_ptr parent = nullptr; +}; + +extern thread_local ActorLineagePropertyMap* currentLineage; + +struct ActorLineage { + std::shared_ptr properties = std::make_shared(); + ActorLineage() { + if (currentLineage) { + properties->parent = currentLineage->shared_from_this(); + } + } +}; + +struct save_lineage { + ActorLineagePropertyMap* current = currentLineage; + ~save_lineage() { + currentLineage = current; + } +}; + // SAV is short for Single Assignment Variable: It can be assigned for only once! 
template struct SAV : private Callback, FastAllocated> { @@ -445,6 +470,7 @@ public: ASSERT(canBeSet()); new (&value_storage) T(std::forward(value)); this->error_state = Error::fromCode(SET_ERROR_CODE); + save_lineage _{}; while (Callback::next != this) Callback::next->fire(this->value()); } @@ -457,6 +483,7 @@ public: void sendError(Error err) { ASSERT(canBeSet() && int16_t(err.code()) > 0); this->error_state = err; + save_lineage _{}; while (Callback::next != this) Callback::next->error(err); } @@ -477,6 +504,7 @@ public: void finishSendAndDelPromiseRef() { // Call only after value_storage has already been initialized! this->error_state = Error::fromCode(SET_ERROR_CODE); + save_lineage _{}; while (Callback::next != this) Callback::next->fire(this->value()); @@ -500,6 +528,7 @@ public: } this->error_state = err; + save_lineage _{}; while (Callback::next != this) Callback::next->error(err); @@ -987,7 +1016,7 @@ static inline void destruct(T& t) { } template -struct Actor : SAV { +struct Actor : SAV, ActorLineage { int8_t actor_wait_state; // -1 means actor is cancelled; 0 means actor is not waiting; 1-N mean waiting in callback group # Actor() : SAV(1, 1), actor_wait_state(0) { /*++actorCount;*/ } @@ -995,7 +1024,7 @@ struct Actor : SAV { }; template <> -struct Actor { +struct Actor : ActorLineage { // This specialization is for a void actor (one not returning a future, hence also uncancellable) int8_t actor_wait_state; // 0 means actor is not waiting; 1-N mean waiting in callback group # From 05f77f905fb3a32c026729479de3de5456a5789e Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Mon, 7 Dec 2020 15:15:25 -0700 Subject: [PATCH 003/317] Added actor lineage --- flow/actorcompiler/ActorCompiler.cs | 1 + flow/actorcompiler/actorcompiler.csproj | 108 +----------------------- flow/actorcompiler/actorcompiler.sln | 34 ++++++++ flow/flow.cpp | 5 +- flow/flow.h | 96 +++++++++++++-------- flow/genericactors.actor.h | 4 + 6 files changed, 110 insertions(+), 138 
deletions(-) create mode 100644 flow/actorcompiler/actorcompiler.sln diff --git a/flow/actorcompiler/ActorCompiler.cs b/flow/actorcompiler/ActorCompiler.cs index 7aef82a42e..dc9de91868 100644 --- a/flow/actorcompiler/ActorCompiler.cs +++ b/flow/actorcompiler/ActorCompiler.cs @@ -452,6 +452,7 @@ namespace actorcompiler fullClassName, string.Join(", ", actor.parameters.Select(p => p.name).ToArray())); + writer.WriteLine("restore_lineage _;"); if (actor.returnType != null) writer.WriteLine("\treturn Future<{1}>({0});", newActor, actor.returnType); else diff --git a/flow/actorcompiler/actorcompiler.csproj b/flow/actorcompiler/actorcompiler.csproj index e737adabd2..b590913634 100644 --- a/flow/actorcompiler/actorcompiler.csproj +++ b/flow/actorcompiler/actorcompiler.csproj @@ -1,108 +1,8 @@ - - + + - Debug - 10.0.20506 - 2.0 - {0ECC1314-3FC2-458D-8E41-B50B4EA24E51} Exe - Properties - actorcompiler - actorcompiler - v4.0 - 512 - $(SolutionDir)bin\$(Configuration)\ - publish\ - true - Disk - false - Foreground - 7 - Days - false - false - true - 0 - 1.0.0.%2a - false - false - true + net5.0 - - true - DEBUG;TRACE - full - AnyCPU - default - prompt - false - false - - - TRACE - true - pdbonly - AnyCPU - default - prompt - false - false - - - - - 3.5 - - - 3.5 - - - 3.5 - - - 4.0 - - - - - - - - - - - - - - False - Microsoft .NET Framework 4 %28x86 and x64%29 - true - - - False - .NET Framework 3.5 SP1 Client Profile - false - - - False - .NET Framework 3.5 SP1 - false - - - False - Windows Installer 3.1 - true - - - - - - - + \ No newline at end of file diff --git a/flow/actorcompiler/actorcompiler.sln b/flow/actorcompiler/actorcompiler.sln new file mode 100644 index 0000000000..a4292bfaaa --- /dev/null +++ b/flow/actorcompiler/actorcompiler.sln @@ -0,0 +1,34 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 15 +VisualStudioVersion = 15.0.26124.0 +MinimumVisualStudioVersion = 15.0.26124.0 +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = 
"actorcompiler", "actorcompiler.csproj", "{0ECC1314-3FC2-458D-8E41-B50B4EA24E51}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Any CPU = Debug|Any CPU + Debug|x64 = Debug|x64 + Debug|x86 = Debug|x86 + Release|Any CPU = Release|Any CPU + Release|x64 = Release|x64 + Release|x86 = Release|x86 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Debug|Any CPU.Build.0 = Debug|Any CPU + {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Debug|x64.ActiveCfg = Debug|Any CPU + {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Debug|x64.Build.0 = Debug|Any CPU + {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Debug|x86.ActiveCfg = Debug|Any CPU + {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Debug|x86.Build.0 = Debug|Any CPU + {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Release|Any CPU.ActiveCfg = Debug|Any CPU + {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Release|Any CPU.Build.0 = Debug|Any CPU + {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Release|x64.ActiveCfg = Debug|Any CPU + {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Release|x64.Build.0 = Debug|Any CPU + {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Release|x86.ActiveCfg = Debug|Any CPU + {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Release|x86.Build.0 = Debug|Any CPU + EndGlobalSection +EndGlobal diff --git a/flow/flow.cpp b/flow/flow.cpp index a2bfcc1510..c4a6097300 100644 --- a/flow/flow.cpp +++ b/flow/flow.cpp @@ -26,7 +26,10 @@ #include #include -thread_local ActorLineagePropertyMap* currentLineage = nullptr; +extern thread_local Reference currentLineage; + +ActorLineage::ActorLineage() : parent(currentLineage) { +} #if (defined(__linux__) || defined(__FreeBSD__)) && defined(__AVX__) && !defined(MEMORY_SANITIZER) // For benchmarking; need a version of rte_memcpy that doesn't live in the same 
compilation unit as the test. diff --git a/flow/flow.h b/flow/flow.h index 155c5db2a2..a0c9793a7a 100644 --- a/flow/flow.h +++ b/flow/flow.h @@ -20,6 +20,7 @@ #ifndef FLOW_FLOW_H #define FLOW_FLOW_H +#include "flow/FastRef.h" #pragma once #pragma warning( disable: 4244 4267 ) // SOMEDAY: Carefully check for integer overflow issues (e.g. size_t to int conversions like this suppresses) @@ -408,28 +409,21 @@ struct SingleCallback { } }; -// in the future we might want to read these from a different thread. std::shared_ptr -// seems to be better suited for this... -struct ActorLineagePropertyMap : std::enable_shared_from_this { - std::shared_ptr parent = nullptr; +struct ActorLineagePropertyMap : ReferenceCounted { }; -extern thread_local ActorLineagePropertyMap* currentLineage; - -struct ActorLineage { - std::shared_ptr properties = std::make_shared(); - ActorLineage() { - if (currentLineage) { - properties->parent = currentLineage->shared_from_this(); - } - } +struct ActorLineage : ReferenceCounted { + Reference map; + Reference parent; + ActorLineage(); }; -struct save_lineage { - ActorLineagePropertyMap* current = currentLineage; - ~save_lineage() { - currentLineage = current; - } +extern thread_local Reference currentLineage; + +struct restore_lineage { + Reference lineage; + restore_lineage() : lineage(currentLineage) {} + ~restore_lineage() { currentLineage = lineage; } }; // SAV is short for Single Assignment Variable: It can be assigned for only once! 
@@ -447,7 +441,8 @@ public: T& value() { return *(T*)&value_storage; } - SAV(int futures, int promises) : futures(futures), promises(promises), error_state(Error::fromCode(UNSET_ERROR_CODE)) { + SAV(int futures, int promises) + : futures(futures), promises(promises), error_state(Error::fromCode(UNSET_ERROR_CODE)) { Callback::prev = Callback::next = this; } ~SAV() { @@ -466,13 +461,14 @@ public: } template - void send(U && value) { + void send(U&& value) { ASSERT(canBeSet()); new (&value_storage) T(std::forward(value)); this->error_state = Error::fromCode(SET_ERROR_CODE); - save_lineage _{}; - while (Callback::next != this) + restore_lineage _; + while (Callback::next != this) { Callback::next->fire(this->value()); + } } void send(Never) { @@ -483,13 +479,15 @@ public: void sendError(Error err) { ASSERT(canBeSet() && int16_t(err.code()) > 0); this->error_state = err; - save_lineage _{}; - while (Callback::next != this) + restore_lineage _; + while (Callback::next != this) { Callback::next->error(err); + } } template void sendAndDelPromiseRef(U && value) { + restore_lineage _; ASSERT(canBeSet()); if (promises == 1 && !futures) { // No one is left to receive the value, so we can just die @@ -503,8 +501,8 @@ public: void finishSendAndDelPromiseRef() { // Call only after value_storage has already been initialized! 
+ restore_lineage _; this->error_state = Error::fromCode(SET_ERROR_CODE); - save_lineage _{}; while (Callback::next != this) Callback::next->fire(this->value()); @@ -520,6 +518,7 @@ public: } void sendErrorAndDelPromiseRef(Error err) { + restore_lineage _; ASSERT(canBeSet() && int16_t(err.code()) > 0); if (promises == 1 && !futures) { // No one is left to receive the value, so we can just die @@ -528,7 +527,6 @@ public: } this->error_state = err; - save_lineage _{}; while (Callback::next != this) Callback::next->error(err); @@ -624,6 +622,7 @@ struct NotifiedQueue : private SingleCallback, FastAllocated if (error.isValid()) return; if (SingleCallback::next != this) { + restore_lineage _; SingleCallback::next->fire(std::forward(value)); } else { @@ -635,8 +634,10 @@ struct NotifiedQueue : private SingleCallback, FastAllocated if (error.isValid()) return; this->error = err; - if (SingleCallback::next != this) + if (SingleCallback::next != this) { + restore_lineage _; SingleCallback::next->error(err); + } } void addPromiseRef() { promises++; } @@ -1016,38 +1017,67 @@ static inline void destruct(T& t) { } template -struct Actor : SAV, ActorLineage { +struct Actor : SAV { + Reference lineage = Reference{new ActorLineage() }; int8_t actor_wait_state; // -1 means actor is cancelled; 0 means actor is not waiting; 1-N mean waiting in callback group # - Actor() : SAV(1, 1), actor_wait_state(0) { /*++actorCount;*/ } + Actor() : SAV(1, 1), actor_wait_state(0) { + /*++actorCount;*/ + currentLineage = lineage; + } + + Reference setLineage() { + auto res = currentLineage; + currentLineage = lineage; + return res; + } //~Actor() { --actorCount; } }; template <> -struct Actor : ActorLineage { +struct Actor { // This specialization is for a void actor (one not returning a future, hence also uncancellable) + Reference lineage = Reference{new ActorLineage() }; int8_t actor_wait_state; // 0 means actor is not waiting; 1-N mean waiting in callback group # - Actor() : actor_wait_state(0) 
{ /*++actorCount;*/ } + Actor() : actor_wait_state(0) { + /*++actorCount;*/ + currentLineage = lineage; + } + + Reference setLineage() { + auto res = currentLineage; + currentLineage = lineage; + return res; + } //~Actor() { --actorCount; } }; template struct ActorCallback : Callback { - virtual void fire(ValueType const& value) override { static_cast(this)->a_callback_fire(this, value); } - virtual void error(Error e) override { static_cast(this)->a_callback_error(this, e); } + virtual void fire(ValueType const& value) override { + auto _ = static_cast(this)->setLineage(); + static_cast(this)->a_callback_fire(this, value); + } + virtual void error(Error e) override { + auto _ = static_cast(this)->setLineage(); + static_cast(this)->a_callback_error(this, e); + } }; template struct ActorSingleCallback : SingleCallback { virtual void fire(ValueType const& value) override { + auto _ = static_cast(this)->setLineage(); static_cast(this)->a_callback_fire(this, value); } virtual void fire(ValueType && value) override { + auto _ = static_cast(this)->setLineage(); static_cast(this)->a_callback_fire(this, std::move(value)); } virtual void error(Error e) override { + auto _ = static_cast(this)->setLineage(); static_cast(this)->a_callback_error(this, e); } }; diff --git a/flow/genericactors.actor.h b/flow/genericactors.actor.h index 3fcab1f7dd..ab9d9c07d5 100644 --- a/flow/genericactors.actor.h +++ b/flow/genericactors.actor.h @@ -1493,6 +1493,10 @@ struct YieldedFutureActor : SAV, ActorCallback setLineage() { + return currentLineage; + } + void a_callback_fire(ActorCallback*, Void) { if (int16_t(in_error_state.code()) == UNSET_ERROR_CODE) { in_error_state = Error::fromCode(SET_ERROR_CODE); From d837e923ad9f8cbf3a5bcd5668a74d4ee0222c32 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Mon, 7 Dec 2020 15:23:18 -0700 Subject: [PATCH 004/317] minor bugfix --- flow/flow.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flow/flow.cpp b/flow/flow.cpp index 
c4a6097300..ed977141bd 100644 --- a/flow/flow.cpp +++ b/flow/flow.cpp @@ -26,7 +26,7 @@ #include #include -extern thread_local Reference currentLineage; +thread_local Reference currentLineage; ActorLineage::ActorLineage() : parent(currentLineage) { } From 2c4e38329e536172d2413da61d884ef944277598 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Wed, 9 Dec 2020 10:19:32 -0700 Subject: [PATCH 005/317] fix some compiler warnings --- fdbclient/SystemData.cpp | 6 +++--- fdbserver/BackupProgress.actor.cpp | 2 +- fdbserver/BackupWorker.actor.cpp | 6 +++--- fdbserver/CommitProxyServer.actor.cpp | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/fdbclient/SystemData.cpp b/fdbclient/SystemData.cpp index b402ad99a7..16733b1ad6 100644 --- a/fdbclient/SystemData.cpp +++ b/fdbclient/SystemData.cpp @@ -57,7 +57,7 @@ const Value keyServersValue( Standalone result, const std::vecto std::vector destTag; bool foundOldLocality = false; - for (const KeyValueRef kv : result) { + for (const KeyValueRef& kv : result) { UID uid = decodeServerTagKey(kv.key); if (std::find(src.begin(), src.end(), uid) != src.end()) { srcTag.push_back( decodeServerTagValue(kv.value) ); @@ -109,7 +109,7 @@ void decodeKeyServersValue( Standalone result, const ValueRef& v src.clear(); dest.clear(); - for (const KeyValueRef kv : result) { + for (const KeyValueRef& kv : result) { Tag tag = decodeServerTagValue(kv.value); if (std::find(srcTag.begin(), srcTag.end(), tag) != srcTag.end()) { src.push_back( decodeServerTagKey(kv.key) ); @@ -122,7 +122,7 @@ void decodeKeyServersValue( Standalone result, const ValueRef& v std::sort(dest.begin(), dest.end()); if(missingIsError && (src.size() != srcTag.size() || dest.size() != destTag.size())) { TraceEvent(SevError, "AttemptedToDecodeMissingTag"); - for (const KeyValueRef kv : result) { + for (const KeyValueRef& kv : result) { Tag tag = decodeServerTagValue(kv.value); UID serverID = decodeServerTagKey(kv.key); TraceEvent("TagUIDMap").detail("Tag", 
tag.toString()).detail("UID", serverID.toString()); diff --git a/fdbserver/BackupProgress.actor.cpp b/fdbserver/BackupProgress.actor.cpp index 3f1d564c16..f496ec0558 100644 --- a/fdbserver/BackupProgress.actor.cpp +++ b/fdbserver/BackupProgress.actor.cpp @@ -121,7 +121,7 @@ std::map, std::map> BackupProgr } } - for (const Tag tag : tags) { // tags without progress data + for (const Tag& tag : tags) { // tags without progress data tagVersions.insert({ tag, adjustedBeginVersion }); TraceEvent("BackupVersionRange", dbgid) .detail("OldEpoch", epoch) diff --git a/fdbserver/BackupWorker.actor.cpp b/fdbserver/BackupWorker.actor.cpp index 3cea9f6611..b5f78593e2 100644 --- a/fdbserver/BackupWorker.actor.cpp +++ b/fdbserver/BackupWorker.actor.cpp @@ -508,7 +508,7 @@ ACTOR Future setBackupKeys(BackupData* self, std::map savedL state std::vector>> prevVersions; state std::vector versionConfigs; state std::vector>> allWorkersReady; - for (const auto [uid, version] : savedLogVersions) { + for (const auto& [uid, version] : savedLogVersions) { versionConfigs.emplace_back(uid); prevVersions.push_back(versionConfigs.back().latestBackupWorkerSavedVersion().get(tr)); allWorkersReady.push_back(versionConfigs.back().allWorkerStarted().get(tr)); @@ -573,7 +573,7 @@ ACTOR Future monitorBackupProgress(BackupData* self) { if (self->recruitedEpoch == self->oldestBackupEpoch) { // update update progress so far if previous epochs are done Version v = std::numeric_limits::max(); - for (const auto [tag, version] : tagVersions) { + for (const auto& [tag, version] : tagVersions) { v = std::min(v, version); } savedLogVersions.emplace(uid, v); @@ -783,7 +783,7 @@ ACTOR Future saveMutationsToFile(BackupData* self, Version popVersion, int .detail("TagId", self->tag.id) .detail("File", file->getFileName()); } - for (const UID uid : activeUids) { + for (const UID& uid : activeUids) { self->backups[uid].lastSavedVersion = popVersion + 1; } diff --git a/fdbserver/CommitProxyServer.actor.cpp 
b/fdbserver/CommitProxyServer.actor.cpp index eac0f0d4c2..96ae4c000c 100644 --- a/fdbserver/CommitProxyServer.actor.cpp +++ b/fdbserver/CommitProxyServer.actor.cpp @@ -1778,7 +1778,7 @@ ACTOR Future commitProxyServerCore(CommitProxyInterface proxy, MasterInter state KeyRange txnKeys = allKeys; Standalone UIDtoTagMap = commitData.txnStateStore->readRange( serverTagKeys ).get(); state std::map tag_uid; - for (const KeyValueRef kv : UIDtoTagMap) { + for (const KeyValueRef& kv : UIDtoTagMap) { tag_uid[decodeServerTagValue(kv.value)] = decodeServerTagKey(kv.key); } loop { From 0d324cee80b306797e6f92392414b786ad5ce914 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Wed, 9 Dec 2020 10:19:59 -0700 Subject: [PATCH 006/317] Annotation framework and role lineage --- fdbrpc/CMakeLists.txt | 2 + fdbrpc/Locality.h | 1 + fdbrpc/RoleLineage.cpp | 23 ++++++++++ fdbrpc/RoleLineage.h | 31 +++++++++++++ fdbserver/worker.actor.cpp | 3 ++ flow/flow.cpp | 6 +++ flow/flow.h | 90 ++++++++++++++++++++++++++++++++------ 7 files changed, 142 insertions(+), 14 deletions(-) create mode 100644 fdbrpc/RoleLineage.cpp create mode 100644 fdbrpc/RoleLineage.h diff --git a/fdbrpc/CMakeLists.txt b/fdbrpc/CMakeLists.txt index b4fb20098d..41229dce47 100644 --- a/fdbrpc/CMakeLists.txt +++ b/fdbrpc/CMakeLists.txt @@ -22,6 +22,8 @@ set(FDBRPC_SRCS ReplicationPolicy.cpp ReplicationTypes.cpp ReplicationUtils.cpp + RoleLineage.h + RoleLineage.cpp Stats.actor.cpp Stats.h sim2.actor.cpp diff --git a/fdbrpc/Locality.h b/fdbrpc/Locality.h index 11c209071a..2129b7a3b7 100644 --- a/fdbrpc/Locality.h +++ b/fdbrpc/Locality.h @@ -63,6 +63,7 @@ struct ProcessClass { Ratekeeper, StorageCache, Backup, + Worker, // used for actor lineage tracking NoRole }; enum ClassSource { CommandLineSource, AutoSource, DBSource, InvalidSource = -1 }; diff --git a/fdbrpc/RoleLineage.cpp b/fdbrpc/RoleLineage.cpp new file mode 100644 index 0000000000..89a64bbe40 --- /dev/null +++ b/fdbrpc/RoleLineage.cpp @@ -0,0 +1,23 @@ +/* + * 
RoleLineage.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2020 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "fdbrpc/RoleLineage.h" + +StringRef RoleLineage::name = "RoleLineage"_sr; diff --git a/fdbrpc/RoleLineage.h b/fdbrpc/RoleLineage.h new file mode 100644 index 0000000000..30a2ea2650 --- /dev/null +++ b/fdbrpc/RoleLineage.h @@ -0,0 +1,31 @@ +/* + * RoleLineage.h + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2020 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once +#include "fdbrpc/Locality.h" + +struct RoleLineage : LineageProperties { + static StringRef name; + ProcessClass::ClusterRole role = ProcessClass::NoRole; + + bool isSet(ProcessClass::ClusterRole RoleLineage::*member) { + return this->*member != ProcessClass::NoRole; + } +}; diff --git a/fdbserver/worker.actor.cpp b/fdbserver/worker.actor.cpp index ca34f903a2..98363ea247 100644 --- a/fdbserver/worker.actor.cpp +++ b/fdbserver/worker.actor.cpp @@ -22,6 +22,7 @@ #include #include "fdbrpc/Locality.h" +#include "fdbrpc/RoleLineage.h" #include "fdbclient/StorageServerInterface.h" #include "fdbserver/Knobs.h" #include "flow/ActorCollection.h" @@ -46,6 +47,7 @@ #include "flow/Profiler.h" #include "flow/ThreadHelper.actor.h" #include "flow/Trace.h" +#include "flow/flow.h" #ifdef __linux__ #include @@ -1810,6 +1812,7 @@ ACTOR Future fdbd( { state vector> actors; state Promise recoveredDiskFiles; + currentLineage->modify(&RoleLineage::role) = ProcessClass::Worker; try { ServerCoordinators coordinators( connFile ); diff --git a/flow/flow.cpp b/flow/flow.cpp index ed977141bd..5b354fe054 100644 --- a/flow/flow.cpp +++ b/flow/flow.cpp @@ -31,6 +31,12 @@ thread_local Reference currentLineage; ActorLineage::ActorLineage() : parent(currentLineage) { } +ActorLineage::~ActorLineage() { + for (auto ptr : properties) { + delete ptr.second; + } +} + #if (defined(__linux__) || defined(__FreeBSD__)) && defined(__AVX__) && !defined(MEMORY_SANITIZER) // For benchmarking; need a version of rte_memcpy that doesn't live in the same compilation unit as the test. 
void * rte_memcpy_noinline(void *__restrict __dest, const void *__restrict __src, size_t __n) { diff --git a/flow/flow.h b/flow/flow.h index a0c9793a7a..0ffc895a86 100644 --- a/flow/flow.h +++ b/flow/flow.h @@ -20,6 +20,7 @@ #ifndef FLOW_FLOW_H #define FLOW_FLOW_H +#include "flow/Arena.h" #include "flow/FastRef.h" #pragma once @@ -29,6 +30,7 @@ #include #include +#include #include #include #include @@ -409,21 +411,88 @@ struct SingleCallback { } }; -struct ActorLineagePropertyMap : ReferenceCounted { +struct LineagePropertiesBase { +}; + +// helper class to make implementation of LineageProperties easier +template +struct LineageProperties : LineagePropertiesBase { + // Contract: + // + // StringRef name = "SomeUniqueName"_str; + + + // this has to be implemented by subclasses + // but can't be made virtual. + // A user should implement this for any type + // within the properies class. + template + bool isSet(Value Derived::*member) { + return true; + } }; struct ActorLineage : ReferenceCounted { - Reference map; +private: + std::unordered_map properties; Reference parent; +public: ActorLineage(); + ~ActorLineage(); + bool isRoot() const { + return parent.getPtr() == nullptr; + } + void makeRoot() { + parent.clear(); + } + template + V& modify(V T::*member) { + auto& res = properties[T::name]; + if (!res) { + res = new T{}; + } + T* map = static_cast(res); + return map->*member; + } + template + std::optional get(V T::*member) const { + auto current = this; + while (current != nullptr) { + auto iter = current->properties.find(T::name); + if (iter != current->properties.end()) { + T const& map = static_cast(*iter->second); + if (map.isSet(member)) { + return map.*member; + } + } + current = current->parent.getPtr(); + } + return std::optional{}; + } + template + std::stack stack(V T::*member) const { + auto current = this; + std::stack res; + while (current != nullptr) { + auto iter = current->properties.find(T::name); + if (iter != current->properties.end()) { + T 
const& map = static_cast(*iter->second); + if (map.isSet(member)) { + res.push(map.*member); + } + } + current = current->parent.getPtr(); + } + return res; + } }; extern thread_local Reference currentLineage; struct restore_lineage { - Reference lineage; - restore_lineage() : lineage(currentLineage) {} - ~restore_lineage() { currentLineage = lineage; } + Reference prev; + restore_lineage() : prev(currentLineage) {} + ~restore_lineage() { currentLineage = prev; } }; // SAV is short for Single Assignment Variable: It can be assigned for only once! @@ -465,7 +534,6 @@ public: ASSERT(canBeSet()); new (&value_storage) T(std::forward(value)); this->error_state = Error::fromCode(SET_ERROR_CODE); - restore_lineage _; while (Callback::next != this) { Callback::next->fire(this->value()); } @@ -479,7 +547,6 @@ public: void sendError(Error err) { ASSERT(canBeSet() && int16_t(err.code()) > 0); this->error_state = err; - restore_lineage _; while (Callback::next != this) { Callback::next->error(err); } @@ -487,7 +554,6 @@ public: template void sendAndDelPromiseRef(U && value) { - restore_lineage _; ASSERT(canBeSet()); if (promises == 1 && !futures) { // No one is left to receive the value, so we can just die @@ -501,7 +567,6 @@ public: void finishSendAndDelPromiseRef() { // Call only after value_storage has already been initialized! 
- restore_lineage _; this->error_state = Error::fromCode(SET_ERROR_CODE); while (Callback::next != this) Callback::next->fire(this->value()); @@ -518,7 +583,6 @@ public: } void sendErrorAndDelPromiseRef(Error err) { - restore_lineage _; ASSERT(canBeSet() && int16_t(err.code()) > 0); if (promises == 1 && !futures) { // No one is left to receive the value, so we can just die @@ -622,7 +686,6 @@ struct NotifiedQueue : private SingleCallback, FastAllocated if (error.isValid()) return; if (SingleCallback::next != this) { - restore_lineage _; SingleCallback::next->fire(std::forward(value)); } else { @@ -635,7 +698,6 @@ struct NotifiedQueue : private SingleCallback, FastAllocated this->error = err; if (SingleCallback::next != this) { - restore_lineage _; SingleCallback::next->error(err); } } @@ -1025,13 +1087,13 @@ struct Actor : SAV { /*++actorCount;*/ currentLineage = lineage; } + //~Actor() { --actorCount; } Reference setLineage() { auto res = currentLineage; currentLineage = lineage; return res; } - //~Actor() { --actorCount; } }; template <> @@ -1045,13 +1107,13 @@ struct Actor { /*++actorCount;*/ currentLineage = lineage; } + //~Actor() { --actorCount; } Reference setLineage() { auto res = currentLineage; currentLineage = lineage; return res; } - //~Actor() { --actorCount; } }; template From 945d0246cddc0dcfff982f22af54c43617bc79a8 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Wed, 9 Dec 2020 13:28:15 -0700 Subject: [PATCH 007/317] add actor stacktrace feature --- flow/actorcompiler/ActorCompiler.cs | 3 ++- flow/flow.cpp | 6 ++++++ flow/flow.h | 12 ++++++++++++ 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/flow/actorcompiler/ActorCompiler.cs b/flow/actorcompiler/ActorCompiler.cs index dc9de91868..28771f4503 100644 --- a/flow/actorcompiler/ActorCompiler.cs +++ b/flow/actorcompiler/ActorCompiler.cs @@ -452,7 +452,7 @@ namespace actorcompiler fullClassName, string.Join(", ", actor.parameters.Select(p => p.name).ToArray())); - 
writer.WriteLine("restore_lineage _;"); + writer.WriteLine("\trestore_lineage _;"); if (actor.returnType != null) writer.WriteLine("\treturn Future<{1}>({0});", newActor, actor.returnType); else @@ -1287,6 +1287,7 @@ namespace actorcompiler constructor.WriteLine("{"); constructor.Indent(+1); ProbeEnter(constructor, actor.name); + constructor.WriteLine("currentLineage->modify(&StackLineage::actorName) = LiteralStringRef(\"{0}\");", actor.name); constructor.WriteLine("this->{0};", body.call()); ProbeExit(constructor, actor.name); WriteFunction(writer, constructor, constructor.BodyText); diff --git a/flow/flow.cpp b/flow/flow.cpp index 5b354fe054..2e47847fcd 100644 --- a/flow/flow.cpp +++ b/flow/flow.cpp @@ -37,6 +37,12 @@ ActorLineage::~ActorLineage() { } } +StringRef StackLineage::name = "StackLineage"_sr; + +std::stack getActorStackTrace() { + return currentLineage->stack(&StackLineage::actorName); +} + #if (defined(__linux__) || defined(__FreeBSD__)) && defined(__AVX__) && !defined(MEMORY_SANITIZER) // For benchmarking; need a version of rte_memcpy that doesn't live in the same compilation unit as the test. void * rte_memcpy_noinline(void *__restrict __dest, const void *__restrict __src, size_t __n) { diff --git a/flow/flow.h b/flow/flow.h index 0ffc895a86..518dbd036c 100644 --- a/flow/flow.h +++ b/flow/flow.h @@ -495,6 +495,18 @@ struct restore_lineage { ~restore_lineage() { currentLineage = prev; } }; +struct StackLineage : LineageProperties { + static StringRef name; + StringRef actorName; + + template + bool isSet(Value StackLineage::*member) { + return true; + } +}; + +extern std::stack getActorStackTrace(); + // SAV is short for Single Assignment Variable: It can be assigned for only once! 
template struct SAV : private Callback, FastAllocated> { From f8e1df6c4f8c5a687afffe2b9a28aa13e32ae9d5 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Thu, 10 Dec 2020 10:42:04 -0700 Subject: [PATCH 008/317] Support for actor stack traces --- fdbrpc/RoleLineage.h | 2 +- fdbserver/CMakeLists.txt | 1 + fdbserver/SigStack.cpp | 23 +++++++++++++++++++++++ fdbserver/worker.actor.cpp | 3 +++ flow/flow.h | 7 +------ tests/TestRunner/local_cluster.py | 2 +- 6 files changed, 30 insertions(+), 8 deletions(-) create mode 100644 fdbserver/SigStack.cpp diff --git a/fdbrpc/RoleLineage.h b/fdbrpc/RoleLineage.h index 30a2ea2650..8e9d3f4e9e 100644 --- a/fdbrpc/RoleLineage.h +++ b/fdbrpc/RoleLineage.h @@ -25,7 +25,7 @@ struct RoleLineage : LineageProperties { static StringRef name; ProcessClass::ClusterRole role = ProcessClass::NoRole; - bool isSet(ProcessClass::ClusterRole RoleLineage::*member) { + bool isSet(ProcessClass::ClusterRole RoleLineage::*member) const { return this->*member != ProcessClass::NoRole; } }; diff --git a/fdbserver/CMakeLists.txt b/fdbserver/CMakeLists.txt index bf266069cb..f52e5b8279 100644 --- a/fdbserver/CMakeLists.txt +++ b/fdbserver/CMakeLists.txt @@ -88,6 +88,7 @@ set(FDBSERVER_SRCS ResolverInterface.h ServerDBInfo.actor.h ServerDBInfo.h + SigStack.cpp SimulatedCluster.actor.cpp SimulatedCluster.h SkipList.cpp diff --git a/fdbserver/SigStack.cpp b/fdbserver/SigStack.cpp new file mode 100644 index 0000000000..efec5aff7d --- /dev/null +++ b/fdbserver/SigStack.cpp @@ -0,0 +1,23 @@ +#include "flow/flow.h" +#include +#include +#include + +// This is not yet correct, as this is not async safe +// However, this should be good enough for an initial +// proof of concept. 
+extern "C" void stackSignalHandler(int sig) { + auto stack = getActorStackTrace(); + int i = 0; + while (!stack.empty()) { + auto s = stack.top(); + stack.pop(); + std::string_view n(reinterpret_cast(s.begin()), s.size()); + std::cout << i << ": " << n << std::endl; + ++i; + } +} + +void setupStackSignal() { + std::signal(SIGUSR1, &stackSignalHandler); +} diff --git a/fdbserver/worker.actor.cpp b/fdbserver/worker.actor.cpp index 98363ea247..5d371c0c80 100644 --- a/fdbserver/worker.actor.cpp +++ b/fdbserver/worker.actor.cpp @@ -1798,6 +1798,8 @@ ACTOR Future monitorLeaderRemotelyWithDelayedCandidacy( Reference fdbd( Reference connFile, LocalityData localities, @@ -1812,6 +1814,7 @@ ACTOR Future fdbd( { state vector> actors; state Promise recoveredDiskFiles; + setupStackSignal(); currentLineage->modify(&RoleLineage::role) = ProcessClass::Worker; try { diff --git a/flow/flow.h b/flow/flow.h index 518dbd036c..b1e4c1e1fb 100644 --- a/flow/flow.h +++ b/flow/flow.h @@ -427,7 +427,7 @@ struct LineageProperties : LineagePropertiesBase { // A user should implement this for any type // within the properies class. 
template - bool isSet(Value Derived::*member) { + bool isSet(Value Derived::*member) const { return true; } }; @@ -498,11 +498,6 @@ struct restore_lineage { struct StackLineage : LineageProperties { static StringRef name; StringRef actorName; - - template - bool isSet(Value StackLineage::*member) { - return true; - } }; extern std::stack getActorStackTrace(); diff --git a/tests/TestRunner/local_cluster.py b/tests/TestRunner/local_cluster.py index 68318d51dd..85f2094774 100644 --- a/tests/TestRunner/local_cluster.py +++ b/tests/TestRunner/local_cluster.py @@ -38,7 +38,7 @@ cluster_file = {etcdir}/fdb.cluster command = {fdbserver_bin} public_address = auto:$ID listen_address = public -datadir = {datadir} +datadir = {datadir}/$ID logdir = {logdir} # logsize = 10MiB # maxlogssize = 100MiB From fb64902d5c5b6e88501ebe906d4d939f61257b9b Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Tue, 19 Jan 2021 16:04:09 -0700 Subject: [PATCH 009/317] Assign roles --- fdbrpc/CMakeLists.txt | 2 -- fdbserver/CMakeLists.txt | 2 ++ .../RoleLineage.actor.cpp | 2 +- .../RoleLineage.actor.h | 21 ++++++++++++++- fdbserver/worker.actor.cpp | 26 ++++++++++++++++++- flow/flow.cpp | 5 ++-- flow/flow.h | 16 ++++++++++++ 7 files changed, 67 insertions(+), 7 deletions(-) rename fdbrpc/RoleLineage.cpp => fdbserver/RoleLineage.actor.cpp (95%) rename fdbrpc/RoleLineage.h => fdbserver/RoleLineage.actor.h (59%) diff --git a/fdbrpc/CMakeLists.txt b/fdbrpc/CMakeLists.txt index 7a9ce26a10..af84676be7 100644 --- a/fdbrpc/CMakeLists.txt +++ b/fdbrpc/CMakeLists.txt @@ -22,8 +22,6 @@ set(FDBRPC_SRCS ReplicationPolicy.cpp ReplicationTypes.cpp ReplicationUtils.cpp - RoleLineage.h - RoleLineage.cpp Stats.actor.cpp Stats.h sim2.actor.cpp diff --git a/fdbserver/CMakeLists.txt b/fdbserver/CMakeLists.txt index afc45b2cc4..9e406a0d26 100644 --- a/fdbserver/CMakeLists.txt +++ b/fdbserver/CMakeLists.txt @@ -86,6 +86,8 @@ set(FDBSERVER_SRCS RestoreWorker.actor.cpp Resolver.actor.cpp ResolverInterface.h + 
RoleLineage.actor.h + RoleLineage.actor.cpp ServerDBInfo.actor.h ServerDBInfo.h SigStack.cpp diff --git a/fdbrpc/RoleLineage.cpp b/fdbserver/RoleLineage.actor.cpp similarity index 95% rename from fdbrpc/RoleLineage.cpp rename to fdbserver/RoleLineage.actor.cpp index 89a64bbe40..6d1b49527a 100644 --- a/fdbrpc/RoleLineage.cpp +++ b/fdbserver/RoleLineage.actor.cpp @@ -18,6 +18,6 @@ * limitations under the License. */ -#include "fdbrpc/RoleLineage.h" +#include "fdbserver/RoleLineage.actor.h" StringRef RoleLineage::name = "RoleLineage"_sr; diff --git a/fdbrpc/RoleLineage.h b/fdbserver/RoleLineage.actor.h similarity index 59% rename from fdbrpc/RoleLineage.h rename to fdbserver/RoleLineage.actor.h index 8e9d3f4e9e..d35c749771 100644 --- a/fdbrpc/RoleLineage.h +++ b/fdbserver/RoleLineage.actor.h @@ -1,5 +1,5 @@ /* - * RoleLineage.h + * RoleLineage.actor.h * * This source file is part of the FoundationDB open source project * @@ -19,7 +19,15 @@ */ #pragma once +#include "flow/flow.h" +#if defined(NO_INTELLISENSE) && !defined(FDBSERVER_ROLE_LINEAGE_ACTOR_G_H) +# define FDBSERVER_ROLE_LINEAGE_ACTOR_G_H +# include "fdbserver/RoleLineage.actor.g.h" +#elif !defined(FDBSERVER_ROLE_LINEAGE_ACTOR_H) +# define FDBSERVER_ROLE_LINEAGE_ACTOR_H + #include "fdbrpc/Locality.h" +#include "flow/actorcompiler.h" // This must be the last include struct RoleLineage : LineageProperties { static StringRef name; @@ -29,3 +37,14 @@ struct RoleLineage : LineageProperties { return this->*member != ProcessClass::NoRole; } }; + +// creates a new root and sets the role lineage +ACTOR template +Future()())> runInRole(Fun fun, ProcessClass::ClusterRole role) { + currentLineage->makeRoot(); + currentLineage->modify(&RoleLineage::role) = role; + decltype(std::declval()()) res = wait(fun()); + return res; +} + +#endif diff --git a/fdbserver/worker.actor.cpp b/fdbserver/worker.actor.cpp index 36f5c14860..19aea8622c 100644 --- a/fdbserver/worker.actor.cpp +++ b/fdbserver/worker.actor.cpp @@ -22,7 +22,6 @@ 
#include #include "fdbrpc/Locality.h" -#include "fdbrpc/RoleLineage.h" #include "fdbclient/StorageServerInterface.h" #include "fdbserver/Knobs.h" #include "flow/ActorCollection.h" @@ -33,6 +32,7 @@ #include "fdbclient/NativeAPI.actor.h" #include "fdbserver/MetricLogger.h" #include "fdbserver/BackupInterface.h" +#include "fdbserver/RoleLineage.actor.h" #include "fdbserver/WorkerInterface.actor.h" #include "fdbserver/IKeyValueStore.h" #include "fdbserver/WaitFailure.h" @@ -1024,6 +1024,8 @@ ACTOR Future workerServer( DiskStore s = stores[f]; // FIXME: Error handling if( s.storedComponent == DiskStore::Storage ) { + LocalLineage _; + currentLineage->modify(&RoleLineage::role) = ProcessClass::ClusterRole::Storage; IKeyValueStore* kv = openKVStore(s.storeType, s.filename, s.storeID, memoryLimit, false, validateDataFiles); Future kvClosed = kv->onClosed(); filesClosed.add( kvClosed ); @@ -1058,6 +1060,8 @@ ACTOR Future workerServer( f = storageServerRollbackRebooter( f, s.storeType, s.filename, recruited.id(), recruited.locality, dbInfo, folder, &filesClosed, memoryLimit, kv); errorForwarders.add( forwardError( errors, Role::STORAGE_SERVER, recruited.id(), f ) ); } else if( s.storedComponent == DiskStore::TLogData ) { + LocalLineage _; + currentLineage->modify(&RoleLineage::role) = ProcessClass::ClusterRole::TLog; std::string logQueueBasename; const std::string filename = basename(s.filename); if (StringRef(filename).startsWith(fileLogDataPrefix)) { @@ -1218,6 +1222,8 @@ ACTOR Future workerServer( } } when( RecruitMasterRequest req = waitNext(interf.master.getFuture()) ) { + LocalLineage _; + currentLineage->modify(&RoleLineage::role) = ProcessClass::ClusterRole::Master; MasterInterface recruited; recruited.locality = locality; recruited.initEndpoints(); @@ -1238,6 +1244,8 @@ ACTOR Future workerServer( req.reply.send(recruited); } when ( InitializeDataDistributorRequest req = waitNext(interf.dataDistributor.getFuture()) ) { + LocalLineage _; + 
currentLineage->modify(&RoleLineage::role) = ProcessClass::ClusterRole::DataDistributor; DataDistributorInterface recruited(locality); recruited.initEndpoints(); @@ -1256,6 +1264,8 @@ ACTOR Future workerServer( req.reply.send(recruited); } when ( InitializeRatekeeperRequest req = waitNext(interf.ratekeeper.getFuture()) ) { + LocalLineage _; + currentLineage->modify(&RoleLineage::role) = ProcessClass::ClusterRole::Ratekeeper; RatekeeperInterface recruited(locality, req.reqId); recruited.initEndpoints(); @@ -1280,6 +1290,8 @@ ACTOR Future workerServer( } when (InitializeBackupRequest req = waitNext(interf.backup.getFuture())) { if (!backupWorkerCache.exists(req.reqId)) { + LocalLineage _; + currentLineage->modify(&RoleLineage::role) = ProcessClass::ClusterRole::Backup; BackupInterface recruited(locality); recruited.initEndpoints(); @@ -1309,6 +1321,8 @@ ACTOR Future workerServer( .detail("MinRecruitable", TLogVersion::MIN_RECRUITABLE); req.reply.sendError(internal_error()); } + LocalLineage _; + currentLineage->modify(&RoleLineage::role) = ProcessClass::ClusterRole::TLog; TLogOptions tLogOptions(req.logVersion, req.spillType); TLogFn tLogFn = tLogFnForOptions(tLogOptions); auto& logData = sharedLogs[SharedLogsKey(tLogOptions, req.storeType)]; @@ -1341,6 +1355,8 @@ ACTOR Future workerServer( } when( InitializeStorageRequest req = waitNext(interf.storage.getFuture()) ) { if( !storageCache.exists( req.reqId ) ) { + LocalLineage _; + currentLineage->modify(&RoleLineage::role) = ProcessClass::ClusterRole::Storage; StorageServerInterface recruited(req.interfaceId); recruited.locality = locality; recruited.initEndpoints(); @@ -1379,6 +1395,8 @@ ACTOR Future workerServer( forwardPromise( req.reply, storageCache.get( req.reqId ) ); } when(InitializeCommitProxyRequest req = waitNext(interf.commitProxy.getFuture())) { + LocalLineage _; + currentLineage->modify(&RoleLineage::role) = ProcessClass::ClusterRole::CommitProxy; CommitProxyInterface recruited; recruited.processId = 
locality.processId(); recruited.provisional = false; @@ -1402,6 +1420,8 @@ ACTOR Future workerServer( req.reply.send(recruited); } when( InitializeGrvProxyRequest req = waitNext(interf.grvProxy.getFuture()) ) { + LocalLineage _; + currentLineage->modify(&RoleLineage::role) = ProcessClass::ClusterRole::GrvProxy; GrvProxyInterface recruited; recruited.processId = locality.processId(); recruited.provisional = false; @@ -1421,6 +1441,8 @@ ACTOR Future workerServer( req.reply.send(recruited); } when( InitializeResolverRequest req = waitNext(interf.resolver.getFuture()) ) { + LocalLineage _; + currentLineage->modify(&RoleLineage::role) = ProcessClass::ClusterRole::Resolver; ResolverInterface recruited; recruited.locality = locality; recruited.initEndpoints(); @@ -1438,6 +1460,8 @@ ACTOR Future workerServer( req.reply.send(recruited); } when( InitializeLogRouterRequest req = waitNext(interf.logRouter.getFuture()) ) { + LocalLineage _; + currentLineage->modify(&RoleLineage::role) = ProcessClass::ClusterRole::LogRouter; TLogInterface recruited(locality); recruited.initEndpoints(); diff --git a/flow/flow.cpp b/flow/flow.cpp index 2e47847fcd..c90bbbe9ae 100644 --- a/flow/flow.cpp +++ b/flow/flow.cpp @@ -28,8 +28,9 @@ thread_local Reference currentLineage; -ActorLineage::ActorLineage() : parent(currentLineage) { -} +LineagePropertiesBase::~LineagePropertiesBase() {} + +ActorLineage::ActorLineage() : parent(currentLineage) {} ActorLineage::~ActorLineage() { for (auto ptr : properties) { diff --git a/flow/flow.h b/flow/flow.h index e043ab49d4..9b3ba698b6 100644 --- a/flow/flow.h +++ b/flow/flow.h @@ -412,6 +412,7 @@ struct SingleCallback { }; struct LineagePropertiesBase { + virtual ~LineagePropertiesBase(); }; // helper class to make implementation of LineageProperties easier @@ -433,6 +434,7 @@ struct LineageProperties : LineagePropertiesBase { }; struct ActorLineage : ReferenceCounted { + friend class LocalLineage; private: std::unordered_map properties; Reference parent; @@ 
-489,6 +491,20 @@ public: extern thread_local Reference currentLineage; +// This class can be used in order to modify all lineage properties +// of actors created within a (non-actor) scope +struct LocalLineage { + Reference lineage = Reference{new ActorLineage() }; + Reference oldLineage; + LocalLineage() { + oldLineage = currentLineage; + currentLineage = lineage; + } + ~LocalLineage() { + currentLineage = oldLineage; + } +}; + struct restore_lineage { Reference prev; restore_lineage() : prev(currentLineage) {} From f40d8c2f490a08351ce3d7e91bfd6752e268548a Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Tue, 19 Jan 2021 16:04:21 -0700 Subject: [PATCH 010/317] make profiler signal handler reentrant safe --- flow/Profiler.actor.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/flow/Profiler.actor.cpp b/flow/Profiler.actor.cpp index ece9bcfafd..33d1542db7 100644 --- a/flow/Profiler.actor.cpp +++ b/flow/Profiler.actor.cpp @@ -148,6 +148,8 @@ struct Profiler { } void signal_handler() { // async signal safe! + static std::atomic inSigHandler = false; + if (!inSigHandler.exchange(true)) { return; } if(profilingEnabled) { double t = timer(); output_buffer->push(*(void**)&t); @@ -156,6 +158,7 @@ struct Profiler { output_buffer->push(addresses[i]); output_buffer->push((void*)-1LL); } + inSigHandler.store(false); } static void signal_handler_for_closure(int, siginfo_t* si, void*, void* self) { // async signal safe! From c3efbe3040770dae65319446b9b3877f29b0ee44 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Tue, 19 Jan 2021 16:52:30 -0700 Subject: [PATCH 011/317] fixed minor bug --- flow/Profiler.actor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flow/Profiler.actor.cpp b/flow/Profiler.actor.cpp index 33d1542db7..d691f46205 100644 --- a/flow/Profiler.actor.cpp +++ b/flow/Profiler.actor.cpp @@ -149,7 +149,7 @@ struct Profiler { void signal_handler() { // async signal safe! 
static std::atomic inSigHandler = false; - if (!inSigHandler.exchange(true)) { return; } + if (inSigHandler.exchange(true)) { return; } if(profilingEnabled) { double t = timer(); output_buffer->push(*(void**)&t); From 5259721a5858a4bdd4eba0877cf931667cc5ac12 Mon Sep 17 00:00:00 2001 From: "Johannes M. Scheuermann" Date: Sun, 14 Mar 2021 19:46:12 +0000 Subject: [PATCH 012/317] Use only one IP address that matches the hostname --- packaging/docker/create_server_environment.bash | 9 +++------ packaging/docker/fdb.bash | 2 +- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/packaging/docker/create_server_environment.bash b/packaging/docker/create_server_environment.bash index 04a23792e2..51a782f991 100644 --- a/packaging/docker/create_server_environment.bash +++ b/packaging/docker/create_server_environment.bash @@ -23,21 +23,18 @@ source /var/fdb/scripts/create_cluster_file.bash function create_server_environment() { - fdb_dir=/var/fdb - env_file=$fdb_dir/.fdbenv - - : > $env_file + env_file=/var/fdb/.fdbenv if [[ "$FDB_NETWORKING_MODE" == "host" ]]; then public_ip=127.0.0.1 elif [[ "$FDB_NETWORKING_MODE" == "container" ]]; then - public_ip=$(grep `hostname` /etc/hosts | sed -e "s/\s *`hostname`.*//") + public_ip=$(hostname -i | awk '{print $1}') else echo "Unknown FDB Networking mode \"$FDB_NETWORKING_MODE\"" 1>&2 exit 1 fi - echo "export PUBLIC_IP=$public_ip" >> $env_file + echo "export PUBLIC_IP=$public_ip" > $env_file if [[ -z $FDB_COORDINATOR && -z "$FDB_CLUSTER_FILE_CONTENTS" ]]; then FDB_CLUSTER_FILE_CONTENTS="docker:docker@$public_ip:$FDB_PORT" fi diff --git a/packaging/docker/fdb.bash b/packaging/docker/fdb.bash index 3bf1c6a680..943c8ed58b 100644 --- a/packaging/docker/fdb.bash +++ b/packaging/docker/fdb.bash @@ -26,4 +26,4 @@ source /var/fdb/.fdbenv echo "Starting FDB server on $PUBLIC_IP:$FDB_PORT" fdbserver --listen_address 0.0.0.0:$FDB_PORT --public_address $PUBLIC_IP:$FDB_PORT \ --datadir /var/fdb/data --logdir /var/fdb/logs \ - 
--locality_zoneid=`hostname` --locality_machineid=`hostname` --class $FDB_PROCESS_CLASS + --locality_zoneid="$(hostname)" --locality_machineid="$(hostname)" --class $FDB_PROCESS_CLASS From 29c626ca6a0d02f1d412327e177cc5db36b02042 Mon Sep 17 00:00:00 2001 From: Jon Fu Date: Mon, 15 Mar 2021 17:36:13 -0400 Subject: [PATCH 013/317] Changed code flow to fix loophole that avoided the knob guarding higher protocol versions and also added new restarting tests --- fdbserver/MoveKeys.actor.cpp | 24 ++++++++------- tests/CMakeLists.txt | 3 ++ .../to_6.2.33/CycleTestRestart-1.txt | 30 +++++++++++++++++++ .../to_6.2.33/CycleTestRestart-2.txt | 26 ++++++++++++++++ 4 files changed, 73 insertions(+), 10 deletions(-) create mode 100644 tests/restarting/to_6.2.33/CycleTestRestart-1.txt create mode 100644 tests/restarting/to_6.2.33/CycleTestRestart-2.txt diff --git a/fdbserver/MoveKeys.actor.cpp b/fdbserver/MoveKeys.actor.cpp index c08f3f3476..83f7170e95 100644 --- a/fdbserver/MoveKeys.actor.cpp +++ b/fdbserver/MoveKeys.actor.cpp @@ -1232,23 +1232,27 @@ void seedShardServers(Arena& arena, CommitTransactionRef& tr, vector serverTags; + std::vector serverSrcUID; serverTags.reserve(servers.size()); - for (int i = 0; i < servers.size(); i++) - serverTags.push_back(server_tag[servers[i].id()]); + for (auto& s : servers) { + serverTags.push_back(server_tag[s.id()]); + serverSrcUID.push_back(s.id()); + } + auto ksValue = CLIENT_KNOBS->TAG_ENCODE_KEY_SERVERS ? 
keyServersValue(serverTags) + : keyServersValue(Standalone(), serverSrcUID); // We have to set this range in two blocks, because the master tracking of "keyServersLocations" depends on a change // to a specific // key (keyServersKeyServersKey) - krmSetPreviouslyEmptyRange( - tr, arena, keyServersPrefix, KeyRangeRef(KeyRef(), allKeys.end), keyServersValue(serverTags), Value()); + krmSetPreviouslyEmptyRange(tr, arena, keyServersPrefix, KeyRangeRef(KeyRef(), allKeys.end), ksValue, Value()); - for (int s = 0; s < servers.size(); s++) - krmSetPreviouslyEmptyRange( - tr, arena, serverKeysPrefixFor(servers[s].id()), allKeys, serverKeysTrue, serverKeysFalse); + for (auto& s : servers) { + krmSetPreviouslyEmptyRange(tr, arena, serverKeysPrefixFor(s.id()), allKeys, serverKeysTrue, serverKeysFalse); + } } diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 132616b1bb..16f0eb2170 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -204,6 +204,9 @@ if(WITH_PYTHON) add_fdb_test( TEST_FILES restarting/from_7.0.0/UpgradeAndBackupRestore-1.toml restarting/from_7.0.0/UpgradeAndBackupRestore-2.toml) + add_fdb_test( + TEST_FILES restarting/to_6.2.33/CycleTestRestart-1.txt + restarting/to_6.2.33/CycleTestRestart-2.txt IGNORE) add_fdb_test( TEST_FILES restarting/to_6.3.10/CycleTestRestart-1.txt restarting/to_6.3.10/CycleTestRestart-2.txt) diff --git a/tests/restarting/to_6.2.33/CycleTestRestart-1.txt b/tests/restarting/to_6.2.33/CycleTestRestart-1.txt new file mode 100644 index 0000000000..647c2f3fe3 --- /dev/null +++ b/tests/restarting/to_6.2.33/CycleTestRestart-1.txt @@ -0,0 +1,30 @@ +testTitle=Clogged + clearAfterTest=false + testName=Cycle + transactionsPerSecond=500.0 + nodeCount=2500 + testDuration=10.0 + expectedRate=0 + + testName=RandomClogging + testDuration=10.0 + + testName=Rollback + meanDelay=10.0 + testDuration=10.0 + + testName=Attrition + machinesToKill=10 + machinesToLeave=3 + reboot=true + testDuration=10.0 + + testName=Attrition + 
machinesToKill=10 + machinesToLeave=3 + reboot=true + testDuration=10.0 + + testName=SaveAndKill + restartInfoLocation=simfdb/restartInfo.ini + testDuration=10.0 diff --git a/tests/restarting/to_6.2.33/CycleTestRestart-2.txt b/tests/restarting/to_6.2.33/CycleTestRestart-2.txt new file mode 100644 index 0000000000..7d498f2be1 --- /dev/null +++ b/tests/restarting/to_6.2.33/CycleTestRestart-2.txt @@ -0,0 +1,26 @@ +testTitle=Clogged + runSetup=false + testName=Cycle + transactionsPerSecond=2500.0 + nodeCount=2500 + testDuration=10.0 + expectedRate=0 + + testName=RandomClogging + testDuration=10.0 + + testName=Rollback + meanDelay=10.0 + testDuration=10.0 + + testName=Attrition + machinesToKill=10 + machinesToLeave=3 + reboot=true + testDuration=10.0 + + testName=Attrition + machinesToKill=10 + machinesToLeave=3 + reboot=true + testDuration=10.0 From a8c7a798f2483c22ffd6c8dacbb0946c81237c12 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Wed, 17 Mar 2021 15:34:20 -0600 Subject: [PATCH 014/317] First prototype of actorlineageset --- flow/ActorLineageSet.cpp | 118 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 118 insertions(+) create mode 100644 flow/ActorLineageSet.cpp diff --git a/flow/ActorLineageSet.cpp b/flow/ActorLineageSet.cpp new file mode 100644 index 0000000000..9fb93e9df7 --- /dev/null +++ b/flow/ActorLineageSet.cpp @@ -0,0 +1,118 @@ +/* + * ActorLineageSet.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flow/flow.h" +#include + +class ActorLineageSet { +public: + // The type we use for lookup into the set. Gets assigned during insert + using Index = unsigned; + // For now we use a fixed size capacity + constexpr static Index CAPACITY = 1024; + constexpr static Index npos = std::numeric_limits::max(); + + explicit ActorLineageSet(); + ActorLineageSet(const ActorLineageSet&) = delete; + ActorLineageSet& operator=(const ActorLineageSet&) = delete; + + // Returns the number of elements at the time of calling. Keep in mind that this is a lockfree data structure, so + // the actual size might change anytime after or even during the call. This function only guarantees that the size + // was whatever the method returns at one point between the start and the end of the function call. The safest way + // to handle this is by assuming that this returns an estimate. 
+ unsigned size(); + + Index insert(const Reference& lineage); + void erase(Index idx); + std::vector> copy(); + +private: + static constexpr uintptr_t FREE = 0b1; + static constexpr uintptr_t LOCK = 0b10; + std::atomic _size = 0; + std::vector> _set; + boost::lockfree::queue, boost::lockfree::capacity> freeQueue; + boost::lockfree::queue, boost::lockfree::capacity> + freeList; +}; + +ActorLineageSet::ActorLineageSet() { + // insert the free indexes in reverse order + for (unsigned i = CAPACITY; i > 0; --i) { + freeQueue.push(i - 1); + _set[i] = uintptr_t(1); + } +} + +std::vector> ActorLineageSet::copy() { + std::vector> result; + for (int i = 0; i < CAPACITY; ++i) { + auto ptr = _set[i].load(); + if ((ptr & FREE) != 0) { + ASSERT((ptr & LOCK) == 0); + if (_set[i].compare_exchange_strong(ptr, ptr | LOCK)) { + ActorLineage* entry = reinterpret_cast(ptr); + ptr |= LOCK; + entry->addref(); + // we try to unlock now. If this element was removed while we incremented the refcount, the element will + // end up in the freeList, so we will decrement later. + _set[i].compare_exchange_strong(ptr, ptr ^ LOCK); + result.emplace_back(entry); + } + } + } + // after we're done we need to clean up all objects that contented on a lock. 
This won't be perfect (as some thread + // might not yet added the object to the free list), but whatever we don't get now we'll clean up in the next + // iteration + ActorLineage* toClean; + while (freeList.pop(toClean)) { + toClean->delref(); + } + return result; +} + +ActorLineageSet::Index ActorLineageSet::insert(const Reference& lineage) { + Index res; + if (!freeQueue.pop(res)) { + TraceEvent(SevWarnAlways, "NoCapacityInActorLineageSet"); + return npos; + } + ASSERT(_set[res].load() & FREE); + auto ptr = reinterpret_cast(lineage.getPtr()); + lineage->addref(); + _set[res].store(ptr); + return res; +} + +void ActorLineageSet::erase(Index idx) { + while (true) { + auto ptr = _set[idx].load(); + if (ptr & LOCK) { + _set[idx].store(FREE); + freeList.push(reinterpret_cast(ptr ^ LOCK)); + return; + } else { + if (_set[idx].compare_exchange_strong(ptr, FREE)) { + reinterpret_cast(ptr)->delref(); + return; + } + } + } +} \ No newline at end of file From 9812a49058adf16c2cdd1445f876f372be074109 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Wed, 17 Mar 2021 15:40:19 -0600 Subject: [PATCH 015/317] use consume_all to clean up after copy --- flow/ActorLineageSet.cpp | 5 +---- flow/CMakeLists.txt | 1 + 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/flow/ActorLineageSet.cpp b/flow/ActorLineageSet.cpp index 9fb93e9df7..0957339501 100644 --- a/flow/ActorLineageSet.cpp +++ b/flow/ActorLineageSet.cpp @@ -81,10 +81,7 @@ std::vector> ActorLineageSet::copy() { // after we're done we need to clean up all objects that contented on a lock. 
This won't be perfect (as some thread // might not yet added the object to the free list), but whatever we don't get now we'll clean up in the next // iteration - ActorLineage* toClean; - while (freeList.pop(toClean)) { - toClean->delref(); - } + freeList.consume_all([](auto toClean) { toClean->delRef(); }); return result; } diff --git a/flow/CMakeLists.txt b/flow/CMakeLists.txt index c838e8eff8..5e89fe4d28 100644 --- a/flow/CMakeLists.txt +++ b/flow/CMakeLists.txt @@ -3,6 +3,7 @@ find_package(Threads REQUIRED) set(FLOW_SRCS ActorCollection.actor.cpp ActorCollection.h + ActorLineageSet.cpp Arena.cpp Arena.h AsioReactor.h From f6c7aa6ac77e55266e030109eb77d24b8894952e Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Wed, 17 Mar 2021 15:50:29 -0600 Subject: [PATCH 016/317] fixed typo --- flow/ActorLineageSet.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flow/ActorLineageSet.cpp b/flow/ActorLineageSet.cpp index 0957339501..9a0d34c9bf 100644 --- a/flow/ActorLineageSet.cpp +++ b/flow/ActorLineageSet.cpp @@ -81,7 +81,7 @@ std::vector> ActorLineageSet::copy() { // after we're done we need to clean up all objects that contented on a lock. 
This won't be perfect (as some thread // might not yet added the object to the free list), but whatever we don't get now we'll clean up in the next // iteration - freeList.consume_all([](auto toClean) { toClean->delRef(); }); + freeList.consume_all([](auto toClean) { toClean->delref(); }); return result; } From 4f1b807e1f480f24a0e3cb9622149953c295a4ab Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Wed, 17 Mar 2021 16:01:23 -0600 Subject: [PATCH 017/317] assert object alignment --- flow/ActorLineageSet.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/flow/ActorLineageSet.cpp b/flow/ActorLineageSet.cpp index 9a0d34c9bf..570976379c 100644 --- a/flow/ActorLineageSet.cpp +++ b/flow/ActorLineageSet.cpp @@ -93,6 +93,7 @@ ActorLineageSet::Index ActorLineageSet::insert(const Reference& li } ASSERT(_set[res].load() & FREE); auto ptr = reinterpret_cast(lineage.getPtr()); + ASSERT((ptr % 4) == 0); // this needs to be at least 4-byte aligned lineage->addref(); _set[res].store(ptr); return res; From 650e0de62570338ebff06cedc819a9bb00a0b925 Mon Sep 17 00:00:00 2001 From: Jon Fu Date: Thu, 18 Mar 2021 15:32:17 -0400 Subject: [PATCH 018/317] Remove extra downgrade workloads to restrict downgrade testing to 1 version apart --- tests/CMakeLists.txt | 3 -- .../to_6.2.33/CycleTestRestart-1.txt | 30 ------------------- .../to_6.2.33/CycleTestRestart-2.txt | 26 ---------------- 3 files changed, 59 deletions(-) delete mode 100644 tests/restarting/to_6.2.33/CycleTestRestart-1.txt delete mode 100644 tests/restarting/to_6.2.33/CycleTestRestart-2.txt diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 16f0eb2170..132616b1bb 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -204,9 +204,6 @@ if(WITH_PYTHON) add_fdb_test( TEST_FILES restarting/from_7.0.0/UpgradeAndBackupRestore-1.toml restarting/from_7.0.0/UpgradeAndBackupRestore-2.toml) - add_fdb_test( - TEST_FILES restarting/to_6.2.33/CycleTestRestart-1.txt - restarting/to_6.2.33/CycleTestRestart-2.txt IGNORE) 
add_fdb_test( TEST_FILES restarting/to_6.3.10/CycleTestRestart-1.txt restarting/to_6.3.10/CycleTestRestart-2.txt) diff --git a/tests/restarting/to_6.2.33/CycleTestRestart-1.txt b/tests/restarting/to_6.2.33/CycleTestRestart-1.txt deleted file mode 100644 index 647c2f3fe3..0000000000 --- a/tests/restarting/to_6.2.33/CycleTestRestart-1.txt +++ /dev/null @@ -1,30 +0,0 @@ -testTitle=Clogged - clearAfterTest=false - testName=Cycle - transactionsPerSecond=500.0 - nodeCount=2500 - testDuration=10.0 - expectedRate=0 - - testName=RandomClogging - testDuration=10.0 - - testName=Rollback - meanDelay=10.0 - testDuration=10.0 - - testName=Attrition - machinesToKill=10 - machinesToLeave=3 - reboot=true - testDuration=10.0 - - testName=Attrition - machinesToKill=10 - machinesToLeave=3 - reboot=true - testDuration=10.0 - - testName=SaveAndKill - restartInfoLocation=simfdb/restartInfo.ini - testDuration=10.0 diff --git a/tests/restarting/to_6.2.33/CycleTestRestart-2.txt b/tests/restarting/to_6.2.33/CycleTestRestart-2.txt deleted file mode 100644 index 7d498f2be1..0000000000 --- a/tests/restarting/to_6.2.33/CycleTestRestart-2.txt +++ /dev/null @@ -1,26 +0,0 @@ -testTitle=Clogged - runSetup=false - testName=Cycle - transactionsPerSecond=2500.0 - nodeCount=2500 - testDuration=10.0 - expectedRate=0 - - testName=RandomClogging - testDuration=10.0 - - testName=Rollback - meanDelay=10.0 - testDuration=10.0 - - testName=Attrition - machinesToKill=10 - machinesToLeave=3 - reboot=true - testDuration=10.0 - - testName=Attrition - machinesToKill=10 - machinesToLeave=3 - reboot=true - testDuration=10.0 From 7080ea1f1f1b281070ecf8f5ab9caa5c7365355b Mon Sep 17 00:00:00 2001 From: Xiaoge Su Date: Tue, 16 Mar 2021 05:05:03 -0700 Subject: [PATCH 019/317] Add document describes how a get/commit is done in FDB --- design/Commit/Commit.svg | 1 + design/Commit/CommitOverall.svg | 1 + design/Commit/GRV.svg | 1 + design/Commit/Get.svg | 1 + design/Commit/GetRange.svg | 1 + 
design/Commit/GetRangeFallback.svg | 1 + design/Commit/How a commit is done in FDB.md | 204 +++++++++++++++++++ design/Commit/commit.sequence | 148 ++++++++++++++ design/Commit/commitoverall.sequence | 54 +++++ design/Commit/get.sequence | 68 +++++++ design/Commit/getrange.sequence | 60 ++++++ design/Commit/getrangefallback.sequence | 80 ++++++++ design/Commit/grv.sequence | 66 ++++++ 13 files changed, 686 insertions(+) create mode 100644 design/Commit/Commit.svg create mode 100644 design/Commit/CommitOverall.svg create mode 100644 design/Commit/GRV.svg create mode 100644 design/Commit/Get.svg create mode 100644 design/Commit/GetRange.svg create mode 100644 design/Commit/GetRangeFallback.svg create mode 100644 design/Commit/How a commit is done in FDB.md create mode 100644 design/Commit/commit.sequence create mode 100644 design/Commit/commitoverall.sequence create mode 100644 design/Commit/get.sequence create mode 100644 design/Commit/getrange.sequence create mode 100644 design/Commit/getrangefallback.sequence create mode 100644 design/Commit/grv.sequence diff --git a/design/Commit/Commit.svg b/design/Commit/Commit.svg new file mode 100644 index 0000000000..6a59a6c0bd --- /dev/null +++ b/design/Commit/Commit.svg @@ -0,0 +1 @@ +CommitClient (NativeAPI.actor.cpp)CommitProxy (CommitProxyServer.actor.cpp)Master Resolver (Resolver.actor.cpp)TLog (TLogServer.actor.cpp)Storage Server (storageserver.actor.cpp)Transaction::commitcommitAndWatchtryCommitwatchValuecommitBatchercommitBatchTagPartitionedLogSystemgetVersionserveLiveCommittedVersionresolveBatchtLogCommitserveWatchValueRequestsCommitTransactionRequestCommitAttachIDNativeAPI.commit.BeforeCommitTransactionRequestBatch commit requestsBatched CommitTransactionRequestsCommitProxyServer.batcherPreresolutionGettingCommitVersionGetCommitVersionRequestGetCommitVersionReplyGotCommitVersionResolveResolveTransactionBatchRequestBeforeWait for memory/needed versionAfterQueueSizeCheckWait for resolver versionAfterOrdererResolve 
the conflictsAfterResolveTransactionBatchReplyProcessingMutationsCalculate the metadataDetermine which transactions should be committedAssign storage server tags to mutationsGetRawCommittedVersionRequestGetRawCommittedVersionReplyAfterStoreCommitsVersion, LogPushDataTLogCommitRequestCommitAttachIDBeforeWaitForVersionWait for the versionBeforeStore the commitPut commit into persistent queueAfterTLogCommitWait all prior message being committedAfterTLogCommitReplyVersion (min)AfterLogPushCommitIDNativeAPI.commit.AfterVersionWatchValueAttachIDBeforeWatchValueRequestwatchValueQ.BeforeEnsure version is not too oldwatchValueQ.AfterVersionCheck storageserver::getValueQwatchValueQ.AfterReadVersionAfterloop[Batch requests]loop[Wait txn commit version enter the MVCC window]loop[Value not change] \ No newline at end of file diff --git a/design/Commit/CommitOverall.svg b/design/Commit/CommitOverall.svg new file mode 100644 index 0000000000..a96b08c205 --- /dev/null +++ b/design/Commit/CommitOverall.svg @@ -0,0 +1 @@ +Commit in FoundationDBClientGetReadVersionProxyCommitProxyMasterResolverTLogRequest read versionRequest committed versionRespond committed versionRespond read versionCommit a mutation with read versionPre-resolutionRequest a commit versionCommit versionCommit versionNeverResolutionSend the transaction to the resolverTransactionCommittedTransactionConflictTransactionTooOldPost-resolutionPush the transaction data to TLogThe version of the transactions that are already durabletlog_stoppedReplyReport raw commit versionVoidCommit versionNot committed: conflictNot committed: too oldalt[New request][Replied before with a commit version][Replied before without commit version]alt[No conflict][Conflict][Read snapshot older than oldest version]alt[TLog not stopped][TLog stopped]alt[Commit successful][Conflict][Transaction too old] \ No newline at end of file diff --git a/design/Commit/GRV.svg b/design/Commit/GRV.svg new file mode 100644 index 0000000000..ab2451fa03 --- 
/dev/null +++ b/design/Commit/GRV.svg @@ -0,0 +1 @@ +Get Read VersionClient (NativeAPI.actor.cpp)GRVProxy (GrvProxyServer.actor.cpp)Master (masterserver.actor.cpp)Transaction::getReadVersionreadVersionBatchergetConsistentReadVersionqueueGetReadVersionRequeststransactionStartergetLiveCommittedVersionserveLiveCommittedVersionVersionRequestBatch read version requestsTransactionAttachIDBeforeGetReadVersionRequestBatch read version requestsGrvProxyServer.queueTransactionStartRequests.BeforeTransactionAttachIDAskLiveCommittedVersionFromMasterconfirmEpochLiveGetRawCommittedVersionRequestGetRawCommittedVersionGetRawCommittedVersionReplyAfterGetReadVersionReplyAfterGetReadVersionReplyGetReadVersionReplyloop[Batch requests]loop[Batch requests] \ No newline at end of file diff --git a/design/Commit/Get.svg b/design/Commit/Get.svg new file mode 100644 index 0000000000..ab2451fa03 --- /dev/null +++ b/design/Commit/Get.svg @@ -0,0 +1 @@ +Get Read VersionClient (NativeAPI.actor.cpp)GRVProxy (GrvProxyServer.actor.cpp)Master (masterserver.actor.cpp)Transaction::getReadVersionreadVersionBatchergetConsistentReadVersionqueueGetReadVersionRequeststransactionStartergetLiveCommittedVersionserveLiveCommittedVersionVersionRequestBatch read version requestsTransactionAttachIDBeforeGetReadVersionRequestBatch read version requestsGrvProxyServer.queueTransactionStartRequests.BeforeTransactionAttachIDAskLiveCommittedVersionFromMasterconfirmEpochLiveGetRawCommittedVersionRequestGetRawCommittedVersionGetRawCommittedVersionReplyAfterGetReadVersionReplyAfterGetReadVersionReplyGetReadVersionReplyloop[Batch requests]loop[Batch requests] \ No newline at end of file diff --git a/design/Commit/GetRange.svg b/design/Commit/GetRange.svg new file mode 100644 index 0000000000..9aa3ac4d13 --- /dev/null +++ b/design/Commit/GetRange.svg @@ -0,0 +1 @@ +GetRangeClient (NativeAPI.actor.cpp)Storage Server 
(storageserver.actor.cpp)Transaction::getRangeTransaction::getReadVersiongetRangegetKeyLocationgetKeyValuesQKeyRangeVersionKeyConsult Get sectionLocationInfoBeforeGetKeyValuesRequeststorageserver.getKeyValues.BeforeWait the SS versionstorageserver.getKeyValues.AfterVersionRealign the keysstorageserver.getKeyValues.AfterKeysstorageserver.getKeyValues.SendGetKeyValuesReply (empty)storageserver.getKeyValues.AfterReadRangeGetKeyValuesReplyAfterCombines the resultsErrorFallbackRangeResultRef or ErrorRangeResultRefloop[Keys in the range]alt[No KV pair stored in this server][KV pair found]alt[Error][Successful] \ No newline at end of file diff --git a/design/Commit/GetRangeFallback.svg b/design/Commit/GetRangeFallback.svg new file mode 100644 index 0000000000..dcb2ea84f3 --- /dev/null +++ b/design/Commit/GetRangeFallback.svg @@ -0,0 +1 @@ +GetRange FallbackClient (NativeAPI.actor.cpp)Storage Server (storageserver.actor.cpp)getRangeFallbackgetKeygetExactRangegetKeyRangeLocationsserveGetKeyValuesRequestsserveGetKeyRequestsKeySelectorWait for the versionGetKeyAttachIDAfterVersionSee getKeyLocation in GetBeforeGetKeyRequestGetKeyReplyAfterKeyErrorUpdate read version if necessaryVersion, KeyRangeRefKeyRangeBeforeGet the locationsAfterLocationInfoBeforeGetKeyValuesRequeststorageserver.getKeyValues.BeforeWait the SS versionstorageserver.getKeyValues.AfterVersionRealign the keysstorageserver.getKeyValues.AfterKeysstorageserver.getKeyValues.SendGetKeyValuesReply (empty)storageserver.getKeyValues.AfterReadRangeGetKeyValuesReplyAfterRangeResultRefopt[Key need resolve]alt[Success][Error]loop[Loop over keys in the range]loop[Loop over shards]alt[No KV pair stored in this server][KV pair found] \ No newline at end of file diff --git a/design/Commit/How a commit is done in FDB.md b/design/Commit/How a commit is done in FDB.md new file mode 100644 index 0000000000..78d74ed1e2 --- /dev/null +++ b/design/Commit/How a commit is done in FDB.md @@ -0,0 +1,204 @@ +# How a commit is done in FDB 
+ +## Overall description + +Legend: + +* `alt` means alternative paths + * The texts in `[]` are conditions + * The texts above the arrow are messages. + +The diagrams are generated using https://sequencediagram.org. The source code of the diagrams is in the `*.sequence` files. + +![CommitOverall](CommitOverall.svg) + + + +## Description of each section + +Before all RPCs mentioned below, the client would first verify if the commit proxies and GRV proxies are changed, by comparing the client information ID it holds to the ID the cluster coordinator holds. If they are different, the proxies are changed and the client will refresh the proxies list. + +### GetReadVersion Section + +* The GRV Proxy sends a request to master to retrieve the current commit version. This version is the read version of the request. + +### Preresolution Section + +* The commit proxy sends a request for commit version, with a request number. + +* - The request number is a monotonically increasing number per commit proxy. + - This ensures that, for each proxy, the master will process the requests in order. + +* The master server waits until the request number is current. + + When the current request number is larger than the incoming request number + + * If a commit version is already assigned to the incoming request number, return the commit version and the version that is immediately before the commit version (prevVersion). + + * Otherwise return `Never` + + * Increase current commit version, return it back to the commit proxy. + + * Only one process serves as master. Thus the commit version is unique for each cluster. + + * The monotonically increasing commit version will ensure each transaction is processed in strict order. + +### Resolution section + +* The commit proxy sends the transaction to the resolver. +* Resolver waits until its version reaches `prevVersion` + * Ensures all transactions having version smaller than this transaction are resolved. 
+ * Detects conflicts for the given transaction: + * If there is no conflict, return `TransactionCommitted` as the status + * Any conflict, return `TransactionConflict` status + * If the read snapshot is not in MVCC, return `TransactionTooOld` status + +### Post Resolution section + +* The proxy waits until the local batch number is current +* The proxy will update the metadata keys and calculate which storage servers are affected +* The proxy then waits until the commit version is current, i.e. only those commits in the MVCC window should be processed. +* The proxy pushes the commit data to TLog +* TLog waits until the commit version is current, then persists the commit. + +### TLog section + +* Wait until *all* TLogs return the transaction result. + +### Reply section + +* The proxy will update the master with its commit version +* Reply the result to the client, based on the result from the resolver. + +## Tracking the process using `g_traceBatch` + +`g_traceBatch` can be used for querying the transactions and commits. A typical query string: + +``` +index=iffdb LogGroup=loggroup Type=location Location=location +``` + +The format of `location` is, in general, `<file>.<function>.<location>`, e.g. + +``` +NativeAPI.getConsistentReadVersion.Before +``` + +means the `location` is at `NativeAPI.actor.cpp`, `ACTOR` `getConsistentReadVersion`, `Before` requesting the read version from GRV Proxy. + +In the following sections, green tag indicates an attach; blue tag indicates an event that the location follows the format mentioned above, where only the `<location>` is included; light-blue tag indicates an event that the location is not following the format, where the full location is included. All the `g_traceBatch` events are tabularized after the diagram. + +`contrib/commit_debug.py` can be used to visualize the commit process. 
+ +### Get Read Version + +![GetReadVersion](GRV.svg) + +| **Role** | **File name** | **Function/Actor** | **Trace** | **Type** | **Location** | +| ------------ | -------------- | --------------------------- | --------------------------------- | ------------------------------------------------------------ | ------------------------------------------------------------ | +| **Client** | NativeAPI | Transaction::getReadVersion | | | | +| | | readVersionBatcher | | [*TransactionAttachID*](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L4639) | | +| | | getConsistentReadVersion | Before | TransactionDebug | [NativeAPI.getConsistentReadVersion.Before](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L4564) | +| **GRVProxy** | GrvProxyServer | queueGetReadVersionRequests | Before | TransactionDebug | [GrvProxyServer.queueTransactionStartRequests.Before](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/GrvProxyServer.actor.cpp#L373-L375) | +| | | transactionStarter | | [*TransactionAttachID*](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/GrvProxyServer.actor.cpp#L734-L735) | | +| | | | AskLiveCommittedVersionFromMaster | TransactionDebug | [GrvProxyServer.transactionStarter.AskLiveCommittedVersionFromMaster](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/GrvProxyServer.actor.cpp#L787-L789) | +| | | getLiveCommittedVersion | confirmEpochLive | TransactionDebug | [GrvProxyServer.getLiveCommittedVersion.confirmEpochLive](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/GrvProxyServer.actor.cpp#L479-L480) | +| **Master** | MasterServer | serveLiveCommittedVersion | GetRawCommittedVersion | TransactionDebug | 
[MasterServer.serveLiveCommittedVersion.GetRawCommittedVersion](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/masterserver.actor.cpp#L1187-L1189) | +| **GRVProxy** | GrvProxyServer | getLiveCommittedVersion | After | TransactionDebug | [GrvProxyServer.getLiveCommittedVersion.After](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/GrvProxyServer.actor.cpp#L500-L501) | +| **Client** | NativeAPI | getConsistentReadVersion | After | TransactionDebug | [NativeAPI.getConsistentReadVersion.After](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L4594-L4595) | + +### Get + +![Get](Get.svg) + +| **Role** | **File name** | **Function/Actor** | **Trace** | **Name** | **Location** | **Notes** | +| ------------------ | ------------------- | ----------------------------------- | ------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | +| **Client** | NativeAPI | Transaction::get | | | | | +| | | Transaction::getReadVersion | | | *(Refer to GetReadVersion)* | | +| | | getKeyLocation | Before | TransactionDebug | [NativeAPI.getKeyLocation.Before](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L1975-L1976) | getKeyLocation is called by getValue, getKeyLocation actually calls getKeyLocation_internal | +| | | | After | TransactionDebug | [NativeAPI.getKeyLocation.After](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L1988-L1989) | | +| | | getValue | | [*GetValueAttachID*](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L2164) | | | +| | | | Before | GetValueDebug | 
[NativeAPI.getValue.Before](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L2165-L2167) | | +| **Storage Server** | StorageServer | serveGetValueRequests | received | GetValueDebug | [StorageServer.received](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/storageserver.actor.cpp#L4325-L4327) | | +| | | getValueQ | DoRead | GetValueDebug | [getValueQ.DoRead](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/storageserver.actor.cpp#L1115-L1117) | | +| | | | AfterVersion | GetValueDebug | [getValueQ.AfterVersion](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/storageserver.actor.cpp#L1122-L1124) | | +| | KeyValueStoreSQLite | KeyValueStoreSQLite::Reader::action | Before | GetValueDebug | [Reader.Before](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/KeyValueStoreSQLite.actor.cpp#L1654-L1656) | | +| | | | After | GetValueDebug | [Reader.After](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/KeyValueStoreSQLite.actor.cpp#L1662-L1664) | | +| | StorageServer | | AfterRead | GetValueDebug | [getValueQ.AfterRead](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/storageserver.actor.cpp#L1185-L1187) | | +| **Client** | NativeAPI | getValue | After | GetValueDebug | [NativeAPI.getValue.After](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L2216-L2218) | (When successful) | +| | | | Error | GetValueDebug | [NativeAPI.getValue.Error](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L2232-L2234) | (Wehn failure) | + + + +### Get Range + +![GetRange](GetRange.svg) + +| **Role** | **File name** | **Function/Actor** | 
**Trace** | **Name** | **Location** | **Notes** | +| ------------------ | ------------- | --------------------------- | -------------- | ---------------- | ------------------------------------------------------------ | ------------------------------------ | +| **Client** | NativeAPI | Transaction::getRange | | | | | +| | | Transaction::getReadVersion | | | *(Refer to GetReadVersion)* | | +| | | getKeyLocation | Before | TransactionDebug | [NativeAPI.getKeyLocation.Before](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L1975) | getKeyLocation is called by getRange | +| | | | After | TransactionDebug | [NativeAPI.getKeyLocation.After](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L1988-L1989) | | +| | | getRange | Before | TransactionDebug | [NativeAPI.getRange.Before](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L3004) | | +| **Storage Server** | storageserver | getKeyValuesQ | Before | TransactionDebug | [storageserver.getKeyValues.Before](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/storageserver.actor.cpp#L1812) | | +| | | | AfterVersion | TransactionDebug | [storageserver.getKeyValues.AfterVersion](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/storageserver.actor.cpp#L1821) | | +| | | | AfterKeys | TransactionDebug | [storageserver.getKeyValues.AfterKeys](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/storageserver.actor.cpp#L1846) | | +| | | | Send | TransactionDebug | [storageserver.getKeyValues.Send](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/storageserver.actor.cpp#L1866) | (When no keys found) | +| | | | AfterReadRange | TransactionDebug | 
[storageserver.getKeyValues.AfterReadRange](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/storageserver.actor.cpp#L1886) | (When found keys in this SS) | +| **Client** | NativeAPI | getRange | After | TransactionDebug | [NativeAPI.getRange.After](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L3044-L3046) | (When successful) | +| | | | Error | TransactionDebug | [NativeAPI.getRange.Error](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L3155-L3156) | (Wehn failure) | + +### GetRange Fallback + +![GetRangeFallback](GetRangeFallback.svg) + +| **Role** | **File name** | **Function/Actor** | **Trace** | **Type** | **Location** | **Notes** | +| ---------- | ------------- | -------------------- | ------------ | ---------------- | ------------------------------------------------------------ | ----------------------------------------------- | +| **Client** | NativeAPI | getRangeFallback | | | | | +| | | getKey | | | *GetKeyAttachID* | | +| | | | AfterVersion | GetKeyDebug | [NativeAPI.getKey.AfterVersion](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L2263-L2266) | | +| | | | Before | GetKeyDebug | [NativeAPI.getKey.Before](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L2285-L2288) | | +| | | | After | GetKeyDebug | [NativeAPI.getKey.After](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L2316-L2318) | Success | +| | | | Error | GetKeyDebug | [NativeAPI.getKey.Error](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L2326) | Error | +| | | getReadVersion | | | | *(Refer to GetReadVersion)* | +| | | getKeyRangeLocations | Before | 
TransactionDebug | [NativeAPI.getKeyLocations.Before](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L2029) | | +| | | | After | TransactionDebug | [NativeAPI.getKeyLocations.After](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L2044) | | +| | | getExactRange | Before | TransactionDebug | [NativeAPI.getExactRange.Before](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L2674) | getKeyRangeLocations is called by getExactRange | +| | | | After | TransactionDebug | [NativeAPI.getExactRange.After](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L2707) | | + +### Commit + +![Commit](Commit.svg) + +| **Role** | **File name** | **Function/Actor** | **Trace** | **Type** | **Location** | **Notes** | +| ---------------- | ----------------- | ------------------------------------------- | -------------------- | ------------------------------------------------------------ | ------------------------------------------------------------ | --------- | +| **Client** | NativeAPI | Transaction::commit | | | | | +| | | commitAndWatch | | | | | +| | | tryCommit | | *[commitAttachID](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L4100)* | | | +| | | | Before | CommitDebug | [NativeAPI.commit.Before](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L4101-L4102) | | +| **Commit Proxy** | CommitProxyServer | commitBatcher | batcher | CommitDebug | [CommitProxyServer.batcher](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/CommitProxyServer.actor.cpp#L244-L245) | | +| | | commitBatch | | | | | +| | | CommitBatchContext::setupTraceBatch | | 
*[CommitAttachID](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/CommitProxyServer.actor.cpp#L526)* | | | +| | | | Before | CommitDebug | [CommitProxyServer.commitBatch.Before](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/CommitProxyServer.actor.cpp#L532) | | +| | | CommitBatchContext::preresolutionProcessing | GettingCommitVersion | CommitDebug | [CommitProxyServer.commitBatch.GettingCommitVersion](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/CommitProxyServer.actor.cpp#L616-L617) | | +| | | | GotCommitVersion | CommitDebug | [CommitProxyServer.commitBatch.GotCommitVersion](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/CommitProxyServer.actor.cpp#L643) | | +| **Resolver** | Resolver | resolveBatch | | *[CommitAttachID](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/Resolver.actor.cpp#L116)* | | | +| | | | Before | CommitDebug | [Resolver.resolveBatch.Before](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/Resolver.actor.cpp#L117) | | +| | | | AfterQueueSizeCheck | CommitDebug | [Resolver.resolveBatch.AfterQueueSizeCheck](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/Resolver.actor.cpp#L137) | | +| | | | AfterOrderer | CommitDebug | [Resolver.resolveBatch.AfterOrderer](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/Resolver.actor.cpp#L172) | | +| | | | After | CommitDebug | [Resolver.resolveBatch.After](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/Resolver.actor.cpp#L296) | | +| **Commit Proxy** | CommitProxyServer | CommitBatchContext::postResolution | ProcessingMutations | CommitDebug | 
[CommitProxyServer.CommitBatch.ProcessingMutations](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/CommitProxyServer.actor.cpp#L1074) | | +| | | | AfterStoreCommits | CommitDebug | [CommitProxyServer.CommitBatch.AfterStoreCommits](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/CommitProxyServer.actor.cpp#L1154) | | +| **TLog** | TLogServer | tLogCommit | | *[commitAttachID](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/TLogServer.actor.cpp#L2047)* | | | +| | | | BeforeWaitForVersion | CommitDebug | [TLogServer.tLogCommit.BeforeWaitForVersion](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/TLogServer.actor.cpp#L2048) | | +| | | | Before | CommitDebug | [TLog.tLogCommit.Before](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/TLogServer.actor.cpp#L2083) | | +| | | | AfterTLogCommit | CommitDebug | [TLog.tLogCommit.AfterTLogCommit](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/TLogServer.actor.cpp#L2107) | | +| | | | After | CommitDebug | [TLog.tLogCommit.After](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/TLogServer.actor.cpp#L2125) | | +| **Commit Proxy** | CommitProxyServer | CommitBatchContext::reply | AfterLogPush | CommitDebug | [CommitProxyServer.CommitBatch.AfterLogPush](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/CommitProxyServer.actor.cpp#L1263) | | +| **Client** | NativeAPI | tryCommit | After | CommitDebug | [NativeAPI.commit.After](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L4152) | | +| | | commitAndWatch | | | | | +| | | watchValue | | 
*[WatchValueAttachID](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L2408)* | | | +| | | | Before | WatchValueDebug | [NativeAPI.watchValue.Before]() | | +| | | | After | WatchValueDebug | [NativeAPI.watchValue.After](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L2431-L2433) | | + diff --git a/design/Commit/commit.sequence b/design/Commit/commit.sequence new file mode 100644 index 0000000000..502a572c22 --- /dev/null +++ b/design/Commit/commit.sequence @@ -0,0 +1,148 @@ +title Commit + +participantgroup **Client** (NativeAPI.actor.cpp) + participant "Transaction::commit" as tC + participant "commitAndWatch" as cAW + participant "tryCommit" as Commit + participant "watchValue" as wV +end + +participantgroup **CommitProxy** (CommitProxyServer.actor.cpp) + participant "commitBatcher" as cB + participant "commitBatch" as Batch + participant "TagPartitionedLogSystem" as TPLS +end + +participantgroup **Master** + participant "getVersion" as gV + participant "serveLiveCommittedVersion" as sLCV +end + +participantgroup **Resolver** (Resolver.actor.cpp) + participant "resolveBatch" as rB +end + +participantgroup **TLog** (TLogServer.actor.cpp) + participant "tLogCommit" as tLC +end + +participantgroup **Storage Server** (storageserver.actor.cpp) + participant "serveWatchValueRequests" as sWVR +end + +autoactivation off + +tC -> cAW: +cAW -> Commit: CommitTransactionRequest + +note right of Commit: //CommitAttachID// +note right of Commit: NativeAPI.commit.Before + +Commit -> cB: CommitTransactionRequest +loop Batch requests + box over cB: Batch commit requests +end + +cB -> Batch: Batched CommitTransactionRequests + +note right of Batch: --CommitProxyServer.batcher-- + +box over Batch: Preresolution + +note right of Batch: GettingCommitVersion + +Batch -> gV: GetCommitVersionRequest +gV -> Batch: GetCommitVersionReply + +note right of 
Batch: GotCommitVersion + +box over Batch: Resolve + +Batch -> rB: ResolveTransactionBatchRequest + +note right of rB: Before + +box over rB: Wait for memory/needed version + +note right of rB: AfterQueueSizeCheck + +box over rB: Wait for resolver version + +note right of rB: AfterOrderer + +box over rB: Resolve the conflicts + +note right of rB: After + +rB --> Batch: ResolveTransactionBatchReply + +note right of Batch: ProcessingMutations + +box over Batch: Calculate the metadata + +box over Batch: Determine which transactions should be committed + +box over Batch: Assign storage server tags to mutations + +loop Wait txn commit version enter the MVCC window + Batch -> sLCV: GetRawCommittedVersionRequest + sLCV --> Batch: GetRawCommittedVersionReply +end + +note right of Batch: AfterStoreCommits + +Batch -> TPLS: Version, LogPushData +TPLS -> tLC: TLogCommitRequest + +note right of tLC: //CommitAttachID// + +note right of tLC: BeforeWaitForVersion + +box over tLC: Wait for the version + +note right of tLC: Before + +box over tLC: Store the commit + +box over tLC: Put commit into persistent queue + +note right of tLC: AfterTLogCommit + +box over tLC: Wait all prior message being committed + +note right of tLC: After + +tLC --> TPLS: TLogCommitReply +TPLS -> Batch: Version (min) + +note right of Batch: AfterLogPush + +Batch --> Commit: CommitID + +note right of Commit: --NativeAPI.commit.After-- + +Commit --> cAW: + +cAW -> wV: Version + +note right of wV: //WatchValueAttachID// +note right of wV: Before + +wV -> sWVR: WatchValueRequest + +note right of sWVR: --watchValueQ.Before-- + +box over sWVR: Ensure version is not too old + +note right of sWVR: --watchValueQ.AfterVersion-- + +loop Value not change + box over sWVR: Check storageserver::getValueQ + note right of sWVR: --watchValueQ.AfterRead-- +end + +sWVR --> wV: Version + +note right of wV: After + +cAW --> tC: diff --git a/design/Commit/commitoverall.sequence b/design/Commit/commitoverall.sequence new file 
mode 100644 index 0000000000..4bb250b219 --- /dev/null +++ b/design/Commit/commitoverall.sequence @@ -0,0 +1,54 @@ +title Commit in FoundationDB + +participant "Client" as C +participant "GetReadVersionProxy" as GRV +participant "CommitProxy" as P +participant "Master" as M +participant "Resolver" as R +participant "TLog" as T + +C ->> GRV: Request read version +GRV ->> M: Request committed version +M ->> GRV: Respond committed version +GRV ->> C: Respond read version + +C ->> P: Commit a mutation with read version + +box right of P: Pre-resolution +P ->> M: Request a commit version +alt New request + M ->> P: Commit version +else Replied before with a commit version + M ->> P: Commit version +else Replied before without commit version + M --x P: Never +end + +box right of P: Resolution +P ->> R: Send the transaction to the resolver +alt No conflict + R ->> P: TransactionCommitted +else Conflict + R ->> P: TransactionConflict +else Read snapshot older than oldest version + R ->> P: TransactionTooOld +end + +box right of P: Post-resolution +P ->> T: Push the transaction data to TLog +alt TLog not stopped + T ->> P: The version of the transactions that are already durable +else TLog stopped + T ->> P: tlog_stopped +end + +box right of P: Reply +P ->> M: Report raw commit version +M -->> P: Void +alt Commit successful + P ->> C: Commit version +else Conflict + P ->> C: Not committed: conflict +else Transaction too old + P ->> C: Not committed: too old +end diff --git a/design/Commit/get.sequence b/design/Commit/get.sequence new file mode 100644 index 0000000000..dcd2ee7073 --- /dev/null +++ b/design/Commit/get.sequence @@ -0,0 +1,68 @@ +title Get + +participantgroup **Client** (NativeAPI.actor.cpp) + participant "Transaction::get" as get + participant "Transaction::getReadVersion" as gRV + participant "getValue" as gV + participant "getKeyLocation" as gKL +end + +participantgroup **CommitProxy** (CommitProxyServer.actor.cpp) + participant "doKeyServerLocationRequest" 
as dKSLR +end + +participantgroup **Storage Server** (storageserver.actor.cpp) + participant "serveGetValueRequests" as sGVR + participant "getValueQ" as gVQ +end + +participantgroup **KeyValueStoreSQLite** (KeyValueStoreSQLite.actor.cpp) + participant "KeyValueStoreSQLite::Reader::action" as axn +end + +autoactivation off + +get -> gRV: +box over gRV: //Consult Get Read Version section// +gRV --> get: Version + +get -> gV: Version, Key +gV -> gKL: Key +note right of gKL: Before + +gKL -> dKSLR: GetKeyServerLocationsRequest +dKSLR --> gKL: GetKeyServerLocationsReply + +note right of gKL: After + +gKL --> gV: LocationInfo + +note right of gV: //GetValueAttachID// + +note right of gV: Before + +gV -> sGVR: GetValueRequest +note right of sGVR: --storageServer.received-- + +sGVR -> gVQ: GetValueRequest + +note right of gVQ: --getValueQ.DoRead-- + +note right of gVQ: --getValueQ.AfterVersion-- + +gVQ -> axn: Key + +note right of axn: --Reader.Before-- +note right of axn: --Reader.After-- + +axn --> gVQ: Value +note right of gVQ: --getValueQ.AfterRead-- + +gVQ --> gV: GetValueReply +alt Error + note right of gV: Error + gV --> get: Error +else Success + note right of gV: After + gV --> get: Value +end diff --git a/design/Commit/getrange.sequence b/design/Commit/getrange.sequence new file mode 100644 index 0000000000..5a07436b99 --- /dev/null +++ b/design/Commit/getrange.sequence @@ -0,0 +1,60 @@ +title GetRange + +participantgroup **Client** (NativeAPI.actor.cpp) + participant "Transaction::getRange" as tGR + participant "Transaction::getReadVersion" as gRV + participant "getRange" as gR + participant "getKeyLocation" as gKL +end + +participantgroup **Storage Server** (storageserver.actor.cpp) + participant "getKeyValuesQ" as gKVQ +end + +autoactivation off + +tGR -> gRV: +tGR -> gR: KeyRange +gRV -->(2) gR: Version + +loop Keys in the range + gR -> gKL: Key + + box over gKL: //Consult Get section// + + gKL --> gR: LocationInfo + + note right of gR: Before + + gR -> 
gKVQ: GetKeyValuesRequest + + note right of gKVQ: --storageserver.getKeyValues.Before-- + + box over gKVQ: Wait the SS version + + note right of gKVQ: --storageserver.getKeyValues.AfterVersion-- + + box over gKVQ: Realign the keys + + note right of gKVQ: --storageserver.getKeyValues.AfterKeys-- + + alt No KV pair stored in this server + note right of gKVQ: --storageserver.getKeyValues.Send-- + gKVQ --> gR: GetKeyValuesReply (empty) + else KV pair found + note right of gKVQ: --storageserver.getKeyValues.AfterReadRange-- + gKVQ --> gR: GetKeyValuesReply + end + + note right of gR: After + + box over gR: Combines the results +end + +alt Error + note right of gR: Error + box over gR: Fallback + gR -> tGR: RangeResultRef or Error +else Successful + gR -> tGR: RangeResultRef +end diff --git a/design/Commit/getrangefallback.sequence b/design/Commit/getrangefallback.sequence new file mode 100644 index 0000000000..7fdbf56a3c --- /dev/null +++ b/design/Commit/getrangefallback.sequence @@ -0,0 +1,80 @@ +title GetRange Fallback + +participantgroup **Client** (NativeAPI.actor.cpp) + participant "getRangeFallback" as gRF + participant "getKey" as gK + participant "getExactRange" as gER + participant "getKeyRangeLocations" as gKRL +end + +participantgroup **Storage Server** (storageserver.actor.cpp) + participant "serveGetKeyValuesRequests" as sGKVR + participant "serveGetKeyRequests" as sGKR +end + +autoactivation off + +opt Key need resolve + gRF -> gK: KeySelector + + box over gK: Wait for the version + + note right of gK: //GetKeyAttachID// + note right of gK: AfterVersion + + box over gK: See getKeyLocation in Get + + note right of gK: Before + + gK -> sGKR: GetKeyRequest + sGKR --> gK: GetKeyReply + + alt Success + note right of gK: After + gK --> gRF: Key + else Error + note right of gK: Error + end +end + +box over gRF: Update read version if necessary + +gRF -> gER: Version, KeyRangeRef + +loop Loop over keys in the range + gER -> gKRL: KeyRange + + note right of gKRL: 
Before + box over gKRL: Get the locations + note right of gKRL: After + + gKRL --> gER: LocationInfo + + loop Loop over shards + note right of gER: Before + + gER -> sGKVR: GetKeyValuesRequest + + note right of sGKVR: --storageserver.getKeyValues.Before-- + + box over sGKVR: Wait the SS version + + note right of sGKVR: --storageserver.getKeyValues.AfterVersion-- + + box over sGKVR: Realign the keys + + note right of sGKVR: --storageserver.getKeyValues.AfterKeys-- + + alt No KV pair stored in this server + note right of sGKVR: --storageserver.getKeyValues.Send-- + sGKVR --> gER: GetKeyValuesReply (empty) + else KV pair found + note right of sGKVR: --storageserver.getKeyValues.AfterReadRange-- + sGKVR --> gER: GetKeyValuesReply + end + + note right of gER: After + end +end + +gER --> gRF: RangeResultRef diff --git a/design/Commit/grv.sequence b/design/Commit/grv.sequence new file mode 100644 index 0000000000..c09ac97830 --- /dev/null +++ b/design/Commit/grv.sequence @@ -0,0 +1,66 @@ +title Get Read Version + +participantgroup **Client** (NativeAPI.actor.cpp) + participant "Transaction::getReadVersion" as gRV + participant "readVersionBatcher" as rVB + participant "getConsistentReadVersion" as gCRV +end + +participantgroup **GRVProxy** (GrvProxyServer.actor.cpp) + participant "queueGetReadVersionRequests" as qGRVR + participant "transactionStarter" as tS + participant "getLiveCommittedVersion" as gLCV +end + +participantgroup **Master** (masterserver.actor.cpp) + participant "serveLiveCommittedVersion" as sLCV +end + +autoactivation off + +gRV -> rVB: VersionRequest + +loop Batch requests + box over rVB:Batch read version requests +end + +note right of rVB: //TransactionAttachID// + +rVB -> gCRV: + +note right of gCRV: Before + +gCRV -> qGRVR: GetReadVersionRequest + +loop Batch requests + box over qGRVR: Batch read version requests +end + +note right of qGRVR: --GrvProxyServer.queueTransactionStartRequests.Before-- + +qGRVR -> tS: + +note right of tS: 
//TransactionAttachID// + +note right of tS: AskLiveCommittedVersionFromMaster + +tS -> gLCV: + +note right of gLCV: confirmEpochLive + +gLCV -> sLCV: GetRawCommittedVersionRequest + +note right of sLCV: GetRawCommittedVersion + +sLCV --> gLCV: GetRawCommittedVersionReply + +note right of gLCV: After + +gLCV --> gCRV: GetReadVersionReply + +note right of gCRV: After + +gCRV --> rVB: GetReadVersionReply + +rVB --> gRV: GetReadVersionReply + From 927c7993ccd1bdb27ca821a773e451bad11292ba Mon Sep 17 00:00:00 2001 From: Xiaoge Su Date: Wed, 17 Mar 2021 01:15:21 -0700 Subject: [PATCH 020/317] Update the documentation per comments --- design/Commit/How a commit is done in FDB.md | 32 +++++++++++++------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/design/Commit/How a commit is done in FDB.md b/design/Commit/How a commit is done in FDB.md index 78d74ed1e2..1b6c9e62e3 100644 --- a/design/Commit/How a commit is done in FDB.md +++ b/design/Commit/How a commit is done in FDB.md @@ -26,14 +26,14 @@ Before all RPCs mentioned below, the client would first verify if the commit pro * The commit proxy sends a request for commit version, with a request number. -* - The request number is a monotonically increasing number per commit proxy. + - The request number is a monotonically increasing number per commit proxy. - This ensures for each proxy, the master will process the requests in order. * The master server waits until the request number is current. When the current request number is larger than the incoming request number - * If a commit version is already assigned to the incoming request number, return the commit version and the version that is immediately before the commit version (prevVersion). + * If a commit version is already assigned to the incoming request number, return the commit version and the previous commit version. (i.e. 
`prevVersion`) * Otherwise return `Never` @@ -41,7 +41,7 @@ Before all RPCs mentioned below, the client would first verify if the commit pro * Only one process serves as master. Thus the commit version is unique for each cluster. - * The monotonically increasing commit version will ensure each transaction processed in strict ordering. + * The monotonically increasing commit version will ensure that each transaction is processed in a strict serial order. ### Resolution section @@ -56,26 +56,25 @@ Before all RPCs mentioned below, the client would first verify if the commit pro ### Post Resolution section * The proxy waits until the local batch number is current -* The proxy will update the metadata keys and calculate which storage servers are affected -* The proxy then waits until the commit version is current, i.e. only those commits in the MVCC window should be processed. -* The proxy pushs the commit data to TLog -* TLog waits the commit version to current, then persist the commit. ### TLog section - +* The proxy updates the metadata keys and attaches corresponding storage servers' tags to all mutations. +* The proxy then waits until the commit version is current, i.e. the proxy's committed version is catching up with the commit version of the batch and these two versions are within the MVCC window. +* The proxy pushs the commit data to TLog +* TLog waits the commit version to current, then persist the commit. * Wait until *all* TLogs returns the transaction result. ### Reply section -* The proxy will update the master its commit version +* The proxy updates the master with the committed version for next GRV request at the master. * Reply the result to the client, base on the result from the resolver. ## Tracking the process using `g_traceBatch` -`g_traceBatch` can be used for querying the transactions and commits. A typical query string: +`g_traceBatch` can be used for querying the transactions and commits. 
A typical query string for Splunk is: ``` -index=iffdb LogGroup=loggroup Type=location Location=location +LogGroup=loggroup Type=type Location=location ``` The format of `location` is, in general, `..`, e.g. @@ -86,6 +85,17 @@ NativeAPI.getConsistentReadVersion.Before means the `location` is at `NativeAPI.actor.cpp`, `ACTOR` `getConsistentReadVersion`, `Before` requesting the read version from GRV Proxy. +Some of example queries are: + +``` +LogGroup=loggroup Type=TransactionDebug Location=NativeAPI* +``` + +``` +LogGroup=loggroup Type=CommitDebug Location=storageserver* +``` + + In the following sections, green tag indicates an attach; blue tag indicates an event that the location follows the format mentioned above, where only the `` is included; light-blue tag indicates an event that the location is not following the format, where the full location is included. All the `g_traceBatch` events are tabularized after the diagram. `contrib/commit_debug.py` can be used to visualize the commit process. From 924253da86afa4dd87c14a911e8d68160d0733a3 Mon Sep 17 00:00:00 2001 From: Xiaoge Su Date: Thu, 18 Mar 2021 17:03:19 -0700 Subject: [PATCH 021/317] Update the documentation per comments II --- design/Commit/How a commit is done in FDB.md | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/design/Commit/How a commit is done in FDB.md b/design/Commit/How a commit is done in FDB.md index 1b6c9e62e3..739549aedf 100644 --- a/design/Commit/How a commit is done in FDB.md +++ b/design/Commit/How a commit is done in FDB.md @@ -56,13 +56,11 @@ Before all RPCs mentioned below, the client would first verify if the commit pro ### Post Resolution section * The proxy waits until the local batch number is current - -### TLog section * The proxy updates the metadata keys and attaches corresponding storage servers' tags to all mutations. * The proxy then waits until the commit version is current, i.e. 
the proxy's committed version is catching up with the commit version of the batch and these two versions are within the MVCC window. -* The proxy pushs the commit data to TLog -* TLog waits the commit version to current, then persist the commit. -* Wait until *all* TLogs returns the transaction result. +* The proxy pushes the commit data to TLogs. +* TLog waits the commit version to be current, then persists the commit. +* Wait until *all* TLogs return the transaction result. ### Reply section @@ -71,10 +69,10 @@ Before all RPCs mentioned below, the client would first verify if the commit pro ## Tracking the process using `g_traceBatch` -`g_traceBatch` can be used for querying the transactions and commits. A typical query string for Splunk is: +`g_traceBatch` can be used for querying the transactions and commits. A typical query in the trace logs is: ``` -LogGroup=loggroup Type=type Location=location +Type=type Location=location ``` The format of `location` is, in general, `..`, e.g. @@ -85,14 +83,14 @@ NativeAPI.getConsistentReadVersion.Before means the `location` is at `NativeAPI.actor.cpp`, `ACTOR` `getConsistentReadVersion`, `Before` requesting the read version from GRV Proxy. 
-Some of example queries are: +Some example queries are: ``` -LogGroup=loggroup Type=TransactionDebug Location=NativeAPI* +Type=TransactionDebug Location=NativeAPI* ``` ``` -LogGroup=loggroup Type=CommitDebug Location=storageserver* +LogGroup=loggroup Type=CommitDebug Location=Resolver.resolveBatch.* ``` From 5c1b674815b1765dbc08eed4d98875163dee5708 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Fri, 19 Mar 2021 10:31:58 -0600 Subject: [PATCH 022/317] implemented test --- flow/CMakeLists.txt | 2 +- flow/WriteOnlySet.actor.cpp | 159 +++++++++++++++++++ flow/{ActorLineageSet.cpp => WriteOnlySet.h} | 75 ++++----- 3 files changed, 187 insertions(+), 49 deletions(-) create mode 100644 flow/WriteOnlySet.actor.cpp rename flow/{ActorLineageSet.cpp => WriteOnlySet.h} (60%) diff --git a/flow/CMakeLists.txt b/flow/CMakeLists.txt index 5e89fe4d28..4c28aee437 100644 --- a/flow/CMakeLists.txt +++ b/flow/CMakeLists.txt @@ -3,7 +3,6 @@ find_package(Threads REQUIRED) set(FLOW_SRCS ActorCollection.actor.cpp ActorCollection.h - ActorLineageSet.cpp Arena.cpp Arena.h AsioReactor.h @@ -70,6 +69,7 @@ set(FLOW_SRCS TreeBenchmark.h UnitTest.cpp UnitTest.h + WriteOnlySet.actor.cpp XmlTraceLogFormatter.cpp XmlTraceLogFormatter.h actorcompiler.h diff --git a/flow/WriteOnlySet.actor.cpp b/flow/WriteOnlySet.actor.cpp new file mode 100644 index 0000000000..d0f7c514ad --- /dev/null +++ b/flow/WriteOnlySet.actor.cpp @@ -0,0 +1,159 @@ +/* + * WriteOnlySet.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2021 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flow/DeterministicRandom.h" +#include "flow/WriteOnlySet.h" +#include "flow/flow.h" +#include "flow/UnitTest.h" + +#include +#include +#include "flow/actorcompiler.h" // has to be last include + +template +auto WriteOnlySet::insert(const Reference& lineage) -> Index { + Index res; + if (!freeQueue.pop(res)) { + TraceEvent(SevWarnAlways, "NoCapacityInWriteOnlySet"); + return npos; + } + ASSERT(_set[res].load() & FREE); + auto ptr = reinterpret_cast(lineage.getPtr()); + ASSERT((ptr % 4) == 0); // this needs to be at least 4-byte aligned + ASSERT((ptr & FREE) == 0 && (ptr & LOCK) == 0); + lineage->addref(); + _set[res].store(ptr); + return res; +} + +template +void WriteOnlySet::erase(Index idx) { + while (true) { + auto ptr = _set[idx].load(); + if (ptr & LOCK) { + _set[idx].store(FREE); + freeList.push(reinterpret_cast(ptr ^ LOCK)); + return; + } else { + if (_set[idx].compare_exchange_strong(ptr, FREE)) { + reinterpret_cast(ptr)->delref(); + return; + } + } + } +} + +// Explicit instantiation +template class WriteOnlySet; + +// testing code +namespace { + +std::atomic instanceCounter = 0; +constexpr double iteration_frequency = 10.0; + +struct TestObject { + mutable std::atomic _refCount = 1; + TestObject() { instanceCounter.fetch_add(1); } + void delref() const { + if (--_refCount == 0) { + delete this; + --instanceCounter; + } + } + void addref() const { ++_refCount; } +}; + +using TestSet = WriteOnlySet; +using Clock = std::chrono::steady_clock; + +ACTOR Future threadjoiner(std::shared_ptr> threads, std::shared_ptr set) { + 
loop { + wait(delay(0.1)); + for (unsigned i = 0;;) { + if (threads->size() == i) { + break; + } + auto& t = (*threads)[i]; + if (t.joinable()) { + t.join(); + if (i + 1 < threads->size()) { + std::swap(*threads->rbegin(), (*threads)[i]); + } + threads->pop_back(); + } else { + ++i; + } + } + if (threads->empty()) { + set->copy(); + ASSERT(instanceCounter.load() == 0); + return Void(); + } + } +} + +void testCopier(std::shared_ptr set, std::chrono::seconds runFor) { + auto start = Clock::now(); + while (true) { + if (Clock::now() - start > runFor) { + return; + } + auto copy = set->copy(); + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + } +} + +void writer(std::shared_ptr set, std::chrono::seconds runFor) { + auto start = Clock::now(); + std::random_device rDev; + DeterministicRandom rnd(rDev()); + while (true) { + if (Clock::now() - start > runFor) { + return; + } + std::vector positions; + for (int i = 0; i < rnd.randomInt(1, 101); ++i) { + positions.push_back(set->insert(Reference(new TestObject()))); + } + rnd.randomShuffle(positions); + for (auto p : positions) { + set->erase(p); + } + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + } +} + +TEST_CASE("/flow/WriteOnlySet") { + if (g_network->isSimulated()) { + // This test is not deterministic, so we shouldn't run it in simulation + return Void(); + } + auto set = std::make_shared(); + auto threads = std::make_shared>(); + std::chrono::seconds runFor(10); + for (int i = 0; i < 5; ++i) { + threads->emplace_back([set, runFor]() { writer(set, runFor); }); + } + threads->emplace_back([set, runFor]() { testCopier(set, runFor); }); + wait(threadjoiner(threads, set)); + return Void(); +} +} // namespace \ No newline at end of file diff --git a/flow/ActorLineageSet.cpp b/flow/WriteOnlySet.h similarity index 60% rename from flow/ActorLineageSet.cpp rename to flow/WriteOnlySet.h index 570976379c..a319ad22f0 100644 --- a/flow/ActorLineageSet.cpp +++ b/flow/WriteOnlySet.h @@ -1,9 +1,9 @@ /* - * 
ActorLineageSet.cpp + * WriteOnlySet.cpp * * This source file is part of the FoundationDB open source project * - * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors + * Copyright 2013-2021 Apple Inc. and the FoundationDB project authors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,20 +18,23 @@ * limitations under the License. */ -#include "flow/flow.h" +#pragma once +#include "flow/Error.h" +#include "flow/FastRef.h" +#include "flow/Trace.h" #include -class ActorLineageSet { +template +class WriteOnlySet { public: // The type we use for lookup into the set. Gets assigned during insert - using Index = unsigned; + using Index = IndexType; // For now we use a fixed size capacity - constexpr static Index CAPACITY = 1024; constexpr static Index npos = std::numeric_limits::max(); - explicit ActorLineageSet(); - ActorLineageSet(const ActorLineageSet&) = delete; - ActorLineageSet& operator=(const ActorLineageSet&) = delete; + explicit WriteOnlySet(); + WriteOnlySet(const WriteOnlySet&) = delete; + WriteOnlySet& operator=(const WriteOnlySet&) = delete; // Returns the number of elements at the time of calling. Keep in mind that this is a lockfree data structure, so // the actual size might change anytime after or even during the call. This function only guarantees that the size @@ -39,36 +42,39 @@ public: // to handle this is by assuming that this returns an estimate. 
unsigned size(); - Index insert(const Reference& lineage); + Index insert(const Reference& lineage); void erase(Index idx); - std::vector> copy(); + std::vector> copy(); private: static constexpr uintptr_t FREE = 0b1; static constexpr uintptr_t LOCK = 0b10; - std::atomic _size = 0; + std::atomic _size = 0; std::vector> _set; + static_assert(std::atomic::is_always_lock_free, "Index type can't be used as a lock-free type"); + static_assert(std::atomic::is_always_lock_free, "uintptr_t can't be used as a lock-free type"); boost::lockfree::queue, boost::lockfree::capacity> freeQueue; - boost::lockfree::queue, boost::lockfree::capacity> - freeList; + boost::lockfree::queue, boost::lockfree::capacity> freeList; }; -ActorLineageSet::ActorLineageSet() { +template +WriteOnlySet::WriteOnlySet() : _set(CAPACITY) { // insert the free indexes in reverse order for (unsigned i = CAPACITY; i > 0; --i) { freeQueue.push(i - 1); - _set[i] = uintptr_t(1); + _set[i] = uintptr_t(FREE); } } -std::vector> ActorLineageSet::copy() { - std::vector> result; +template +std::vector> WriteOnlySet::copy() { + std::vector> result; for (int i = 0; i < CAPACITY; ++i) { auto ptr = _set[i].load(); if ((ptr & FREE) != 0) { ASSERT((ptr & LOCK) == 0); if (_set[i].compare_exchange_strong(ptr, ptr | LOCK)) { - ActorLineage* entry = reinterpret_cast(ptr); + T* entry = reinterpret_cast(ptr); ptr |= LOCK; entry->addref(); // we try to unlock now. 
If this element was removed while we incremented the refcount, the element will @@ -85,32 +91,5 @@ std::vector> ActorLineageSet::copy() { return result; } -ActorLineageSet::Index ActorLineageSet::insert(const Reference& lineage) { - Index res; - if (!freeQueue.pop(res)) { - TraceEvent(SevWarnAlways, "NoCapacityInActorLineageSet"); - return npos; - } - ASSERT(_set[res].load() & FREE); - auto ptr = reinterpret_cast(lineage.getPtr()); - ASSERT((ptr % 4) == 0); // this needs to be at least 4-byte aligned - lineage->addref(); - _set[res].store(ptr); - return res; -} - -void ActorLineageSet::erase(Index idx) { - while (true) { - auto ptr = _set[idx].load(); - if (ptr & LOCK) { - _set[idx].store(FREE); - freeList.push(reinterpret_cast(ptr ^ LOCK)); - return; - } else { - if (_set[idx].compare_exchange_strong(ptr, FREE)) { - reinterpret_cast(ptr)->delref(); - return; - } - } - } -} \ No newline at end of file +class ActorLineage; +extern template class WriteOnlySet; From 459afeed4cd9d6df4892e085f94d369af59f1efc Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Fri, 19 Mar 2021 11:25:55 -0600 Subject: [PATCH 023/317] disable jemalloc on macOS --- cmake/Jemalloc.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/Jemalloc.cmake b/cmake/Jemalloc.cmake index 6dff173b93..e89ef3ce82 100644 --- a/cmake/Jemalloc.cmake +++ b/cmake/Jemalloc.cmake @@ -3,7 +3,7 @@ add_library(jemalloc INTERFACE) set(USE_JEMALLOC ON) # We don't want to use jemalloc on Windows # Nor on FreeBSD, where jemalloc is the default system allocator -if(USE_SANITIZER OR WIN32 OR (CMAKE_SYSTEM_NAME STREQUAL "FreeBSD")) +if(USE_SANITIZER OR WIN32 OR (CMAKE_SYSTEM_NAME STREQUAL "FreeBSD") OR APPLE) set(USE_JEMALLOC OFF) return() endif() From 995ae34b1e637f6f776fc889e00474eb1ca1a322 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Fri, 19 Mar 2021 17:10:42 -0600 Subject: [PATCH 024/317] Bugfxies & hack to allow new unit test to run --- fdbserver/fdbserver.actor.cpp | 4 ++ 
flow/WriteOnlySet.actor.cpp | 89 ++++++++++++++++++++++++++++++----- flow/WriteOnlySet.h | 44 +++-------------- 3 files changed, 89 insertions(+), 48 deletions(-) diff --git a/fdbserver/fdbserver.actor.cpp b/fdbserver/fdbserver.actor.cpp index ff28269e4f..a285c0b958 100644 --- a/fdbserver/fdbserver.actor.cpp +++ b/fdbserver/fdbserver.actor.cpp @@ -66,6 +66,7 @@ #include "flow/SystemMonitor.h" #include "flow/TLSConfig.actor.h" #include "flow/Tracing.h" +#include "flow/WriteOnlySet.h" #if defined(__linux__) || defined(__FreeBSD__) #include @@ -1572,6 +1573,9 @@ private: } // namespace int main(int argc, char* argv[]) { + // TODO: Remove later, this is just to force the statics to be initialized + // otherwise the unit test won't run + ActorLineageSet _; try { platformInit(); diff --git a/flow/WriteOnlySet.actor.cpp b/flow/WriteOnlySet.actor.cpp index d0f7c514ad..32023f5e24 100644 --- a/flow/WriteOnlySet.actor.cpp +++ b/flow/WriteOnlySet.actor.cpp @@ -34,32 +34,75 @@ auto WriteOnlySet::insert(const Reference& lineage) - TraceEvent(SevWarnAlways, "NoCapacityInWriteOnlySet"); return npos; } - ASSERT(_set[res].load() & FREE); + ASSERT(_set[res].load() == 0); auto ptr = reinterpret_cast(lineage.getPtr()); - ASSERT((ptr % 4) == 0); // this needs to be at least 4-byte aligned - ASSERT((ptr & FREE) == 0 && (ptr & LOCK) == 0); + ASSERT((ptr % 2) == 0); // this needs to be at least 2-byte aligned + ASSERT(ptr != 0); lineage->addref(); _set[res].store(ptr); return res; } template -void WriteOnlySet::erase(Index idx) { +bool WriteOnlySet::eraseImpl(Index idx) { while (true) { auto ptr = _set[idx].load(); if (ptr & LOCK) { - _set[idx].store(FREE); + _set[idx].store(0); freeList.push(reinterpret_cast(ptr ^ LOCK)); - return; + return false; } else { - if (_set[idx].compare_exchange_strong(ptr, FREE)) { + if (_set[idx].compare_exchange_strong(ptr, 0)) { reinterpret_cast(ptr)->delref(); - return; + return true; } } } } +template +bool WriteOnlySet::erase(Index idx) { + auto res = 
eraseImpl(idx); + ASSERT(freeQueue.push(idx)); + return res; +} + +template +WriteOnlySet::WriteOnlySet() : _set(CAPACITY) { + // insert the free indexes in reverse order + for (unsigned i = CAPACITY; i > 0; --i) { + freeQueue.push(i - 1); + _set[i] = uintptr_t(0); + } +} + +template +std::vector> WriteOnlySet::copy() { + std::vector> result; + for (int i = 0; i < CAPACITY; ++i) { + auto ptr = _set[i].load(); + if (ptr) { + ASSERT((ptr & LOCK) == 0); // if we lock something we need to immediately unlock after we're done copying + // We attempt lock so this won't get deleted. We will try this only once, if the other thread removed the + // object from the set between the previews lines and now, we just won't make it part of the result. + if (_set[i].compare_exchange_strong(ptr, ptr | LOCK)) { + T* entry = reinterpret_cast(ptr); + ptr |= LOCK; + entry->addref(); + // we try to unlock now. If this element was removed while we incremented the refcount, the element will + // end up in the freeList, so we will decrement later. + _set[i].compare_exchange_strong(ptr, ptr ^ LOCK); + result.emplace_back(entry); + } + } + } + // after we're done we need to clean up all objects that contented on a lock. 
This won't be perfect (as some thread + // might not yet added the object to the free list), but whatever we don't get now we'll clean up in the next + // iteration + freeList.consume_all([](auto toClean) { toClean->delref(); }); + return result; +} + // Explicit instantiation template class WriteOnlySet; @@ -67,7 +110,10 @@ template class WriteOnlySet; namespace { std::atomic instanceCounter = 0; -constexpr double iteration_frequency = 10.0; +std::atomic numInserts = 0; +std::atomic numErase = 0; +std::atomic numLockedErase = 0; +std::atomic numCopied = 0; struct TestObject { mutable std::atomic _refCount = 1; @@ -117,6 +163,7 @@ void testCopier(std::shared_ptr set, std::chrono::seconds runFor) { return; } auto copy = set->copy(); + numCopied.fetch_add(copy.size()); std::this_thread::sleep_for(std::chrono::milliseconds(10)); } } @@ -126,17 +173,32 @@ void writer(std::shared_ptr set, std::chrono::seconds runFor) { std::random_device rDev; DeterministicRandom rnd(rDev()); while (true) { + unsigned inserts = 0, erases = 0; if (Clock::now() - start > runFor) { return; } std::vector positions; for (int i = 0; i < rnd.randomInt(1, 101); ++i) { - positions.push_back(set->insert(Reference(new TestObject()))); + Reference o(new TestObject()); + auto pos = set->insert(o); + if (pos == TestSet::npos) { + // could not insert -- ignore + break; + } + ++inserts; + ASSERT(pos < TestSet::capacity); + positions.push_back(pos); } rnd.randomShuffle(positions); for (auto p : positions) { - set->erase(p); + if (!set->erase(p)) { + ++numLockedErase; + } + ++erases; } + numInserts.fetch_add(inserts); + numErase.fetch_add(erases); + ASSERT(inserts == erases); std::this_thread::sleep_for(std::chrono::milliseconds(1)); } } @@ -154,6 +216,11 @@ TEST_CASE("/flow/WriteOnlySet") { } threads->emplace_back([set, runFor]() { testCopier(set, runFor); }); wait(threadjoiner(threads, set)); + TraceEvent("WriteOnlySetTestResult") + .detail("Inserts", numInserts.load()) + .detail("Erases", 
numErase.load()) + .detail("Copies", numCopied.load()) + .detail("LockedErase", numLockedErase.load()); return Void(); } } // namespace \ No newline at end of file diff --git a/flow/WriteOnlySet.h b/flow/WriteOnlySet.h index a319ad22f0..9d80795c68 100644 --- a/flow/WriteOnlySet.h +++ b/flow/WriteOnlySet.h @@ -31,6 +31,7 @@ public: using Index = IndexType; // For now we use a fixed size capacity constexpr static Index npos = std::numeric_limits::max(); + constexpr static IndexType capacity = CAPACITY; explicit WriteOnlySet(); WriteOnlySet(const WriteOnlySet&) = delete; @@ -43,12 +44,13 @@ public: unsigned size(); Index insert(const Reference& lineage); - void erase(Index idx); + bool erase(Index idx); std::vector> copy(); private: - static constexpr uintptr_t FREE = 0b1; - static constexpr uintptr_t LOCK = 0b10; + bool eraseImpl(Index idx); + + static constexpr uintptr_t LOCK = 0b1; std::atomic _size = 0; std::vector> _set; static_assert(std::atomic::is_always_lock_free, "Index type can't be used as a lock-free type"); @@ -57,39 +59,7 @@ private: boost::lockfree::queue, boost::lockfree::capacity> freeList; }; -template -WriteOnlySet::WriteOnlySet() : _set(CAPACITY) { - // insert the free indexes in reverse order - for (unsigned i = CAPACITY; i > 0; --i) { - freeQueue.push(i - 1); - _set[i] = uintptr_t(FREE); - } -} - -template -std::vector> WriteOnlySet::copy() { - std::vector> result; - for (int i = 0; i < CAPACITY; ++i) { - auto ptr = _set[i].load(); - if ((ptr & FREE) != 0) { - ASSERT((ptr & LOCK) == 0); - if (_set[i].compare_exchange_strong(ptr, ptr | LOCK)) { - T* entry = reinterpret_cast(ptr); - ptr |= LOCK; - entry->addref(); - // we try to unlock now. If this element was removed while we incremented the refcount, the element will - // end up in the freeList, so we will decrement later. - _set[i].compare_exchange_strong(ptr, ptr ^ LOCK); - result.emplace_back(entry); - } - } - } - // after we're done we need to clean up all objects that contented on a lock. 
This won't be perfect (as some thread - // might not yet added the object to the free list), but whatever we don't get now we'll clean up in the next - // iteration - freeList.consume_all([](auto toClean) { toClean->delref(); }); - return result; -} - class ActorLineage; extern template class WriteOnlySet; + +using ActorLineageSet = WriteOnlySet; From 99ac47e96c10922ca40e1267467bcfcbb51a51a0 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Fri, 19 Mar 2021 18:08:09 -0600 Subject: [PATCH 025/317] documentation --- flow/WriteOnlySet.actor.cpp | 6 ++++ flow/WriteOnlySet.h | 65 +++++++++++++++++++++++++++++++++---- 2 files changed, 64 insertions(+), 7 deletions(-) diff --git a/flow/WriteOnlySet.actor.cpp b/flow/WriteOnlySet.actor.cpp index 32023f5e24..93d9e99fc7 100644 --- a/flow/WriteOnlySet.actor.cpp +++ b/flow/WriteOnlySet.actor.cpp @@ -109,12 +109,14 @@ template class WriteOnlySet; // testing code namespace { +// Some statistics std::atomic instanceCounter = 0; std::atomic numInserts = 0; std::atomic numErase = 0; std::atomic numLockedErase = 0; std::atomic numCopied = 0; +// A simple object that counts the number of its instances. This is used to detect memory leaks. struct TestObject { mutable std::atomic _refCount = 1; TestObject() { instanceCounter.fetch_add(1); } @@ -130,6 +132,7 @@ struct TestObject { using TestSet = WriteOnlySet; using Clock = std::chrono::steady_clock; +// An actor that can join a set of threads in an async way. ACTOR Future threadjoiner(std::shared_ptr> threads, std::shared_ptr set) { loop { wait(delay(0.1)); @@ -156,6 +159,7 @@ ACTOR Future threadjoiner(std::shared_ptr> thread } } +// occasionally copy the contents of the past set. 
void testCopier(std::shared_ptr set, std::chrono::seconds runFor) { auto start = Clock::now(); while (true) { @@ -168,6 +172,7 @@ void testCopier(std::shared_ptr set, std::chrono::seconds runFor) { } } +// In a loop adds and removes a set of objects to the set void writer(std::shared_ptr set, std::chrono::seconds runFor) { auto start = Clock::now(); std::random_device rDev; @@ -203,6 +208,7 @@ void writer(std::shared_ptr set, std::chrono::seconds runFor) { } } +// This unit test creates 5 writer threads and one copier thread. TEST_CASE("/flow/WriteOnlySet") { if (g_network->isSimulated()) { // This test is not deterministic, so we shouldn't run it in simulation diff --git a/flow/WriteOnlySet.h b/flow/WriteOnlySet.h index 9d80795c68..a2589ec387 100644 --- a/flow/WriteOnlySet.h +++ b/flow/WriteOnlySet.h @@ -24,6 +24,21 @@ #include "flow/Trace.h" #include +/** + * This is a Write-Only set that supports copying the whole content. This data structure is lock-free and allows a user + * to insert and remove objects up to a given capacity (passed by a template). + * + * Template parameters: + * \param T The type to store. + * \param IndexType The type used as an index + * \param CAPACITY The maximum number of object this structure can store (if a user tries to store more, insert will + * fail gracefully) + * \pre T implements `void addref() const` and `void delref() const` + * \pre IndexType must have a copy constructor + * \pre IndexType must have a trivial assignment operator + * \pre IndexType must have a trivial destructor + * \pre IndexType can be used as an index into a std::vector + */ template class WriteOnlySet { public: @@ -37,25 +52,61 @@ public: WriteOnlySet(const WriteOnlySet&) = delete; WriteOnlySet& operator=(const WriteOnlySet&) = delete; - // Returns the number of elements at the time of calling. Keep in mind that this is a lockfree data structure, so - // the actual size might change anytime after or even during the call. 
This function only guarantees that the size - // was whatever the method returns at one point between the start and the end of the function call. The safest way - // to handle this is by assuming that this returns an estimate. - unsigned size(); + /** + * Attempts to insert \p lineage into the set. This method can fail if the set is full (its size is equal to its + * capacity). Calling insert on a full set is safe but the method will return \ref npos if the operation fails. + * + * \param lineage A reference to the object the user wants to insert. + * \ret An index that can later be used to erase the value again or \ref npos if the insert failed. + * \pre lineage.getPtr() % 2 == 0 (the memory for lineage has to be at least 2 byte aligned) + */ + [[nodiscard]] Index insert(const Reference& lineage); - Index insert(const Reference& lineage); + /** + * Erases the object associated with \p idx from the set. + * + * \ret Whether the reference count was decremented. Usually the return value is only interesting for testing and + * benchmarking purposes and will in most cases be ignored. If \ref delref wasn't called, it will be called + * later. Note that at the time the return value is checked, \ref delref might already have been called. + */ bool erase(Index idx); + /** + * Copies all elements that are stored in the set into a vector. This copy operation does NOT provide a snapshot of + * the data structure. The contract is weak: + * - All object that were in the set before copy is called and weren't removed until after copy returned are + * guaranteed to be in the result. + * - Any object that was inserted while copy is running might be in the result. + * - Any object that was erased while copy is running might be in the result. + */ std::vector> copy(); private: + // the implementation of erase -- the wrapper just makes the function a bit more readable. 
bool eraseImpl(Index idx); + // the last bit of a pointer within the set is used like a boolean and true means that the object is locked. Locking + // an object is only relevant for memory management. A locked pointer can still be erased from the set, but the + // erase won't call delref on the object. Instead it will push the pointer into the \ref freeList and copy will call + // delref later. static constexpr uintptr_t LOCK = 0b1; - std::atomic _size = 0; + + // The actual memory std::vector> _set; static_assert(std::atomic::is_always_lock_free, "Index type can't be used as a lock-free type"); static_assert(std::atomic::is_always_lock_free, "uintptr_t can't be used as a lock-free type"); + + // The freeQueue. On creation all indexes (0..capacity-1) are pushed into this queue. On insert one element from + // this queue is consumed and the resulting number is used as an index into the set. On erase the index is given + // back to the freeQueue. boost::lockfree::queue, boost::lockfree::capacity> freeQueue; + + // The freeList is used for memory management. Generally copying a shared pointer can't be done in a lock-free way. + // Instead, when we copy the data structure we first copy the address, then attempt to set the last bit to 1 and + // only if that succeeds we will increment the reference count. Whenever we attempt to remove an object + // in \ref erase we remove the object from the set (using an atomic compare and swap) and only decrement the + // reference count if the last bit is 0. If it's not we'll push the pointer into this free list. + // \ref copy will consume all elements from this freeList each time it runs and decrements the refcount for each + // element. 
boost::lockfree::queue, boost::lockfree::capacity> freeList; }; From 6746bbaba7d919736ecd7b8d5d9134507f448bde Mon Sep 17 00:00:00 2001 From: Xiaoge Su Date: Mon, 22 Mar 2021 10:36:45 -0700 Subject: [PATCH 026/317] Update the document per comments III --- design/Commit/How a commit is done in FDB.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/design/Commit/How a commit is done in FDB.md b/design/Commit/How a commit is done in FDB.md index 739549aedf..1e34ac481e 100644 --- a/design/Commit/How a commit is done in FDB.md +++ b/design/Commit/How a commit is done in FDB.md @@ -1,5 +1,8 @@ # How a commit is done in FDB +This doc describes how commit is done in FDB 6.3+. +The commit path in FDB 6.3 and before is documented in [documentation/sphinx/source/read-write-path.rst](https://github.com/apple/foundationdb/pull/4099). + ## Overall description Legend: From 61352b912444c5d3601b8e33de234cc1f61fe32b Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Mon, 22 Mar 2021 11:41:45 -0600 Subject: [PATCH 027/317] use push_back where emplace_back is unnecessary --- flow/WriteOnlySet.actor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flow/WriteOnlySet.actor.cpp b/flow/WriteOnlySet.actor.cpp index 93d9e99fc7..9ab63aa56f 100644 --- a/flow/WriteOnlySet.actor.cpp +++ b/flow/WriteOnlySet.actor.cpp @@ -92,7 +92,7 @@ std::vector> WriteOnlySet::copy() { // we try to unlock now. If this element was removed while we incremented the refcount, the element will // end up in the freeList, so we will decrement later. 
_set[i].compare_exchange_strong(ptr, ptr ^ LOCK); - result.emplace_back(entry); + result.push_back(entry); } } } From 301daf326939d6378d410420d007322f7c7a3dd3 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Mon, 22 Mar 2021 11:46:16 -0600 Subject: [PATCH 028/317] address review comments --- flow/WriteOnlySet.actor.cpp | 2 +- flow/WriteOnlySet.h | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/flow/WriteOnlySet.actor.cpp b/flow/WriteOnlySet.actor.cpp index 9ab63aa56f..364c53460d 100644 --- a/flow/WriteOnlySet.actor.cpp +++ b/flow/WriteOnlySet.actor.cpp @@ -1,5 +1,5 @@ /* - * WriteOnlySet.cpp + * WriteOnlySet.actor.cpp * * This source file is part of the FoundationDB open source project * diff --git a/flow/WriteOnlySet.h b/flow/WriteOnlySet.h index a2589ec387..c71736f852 100644 --- a/flow/WriteOnlySet.h +++ b/flow/WriteOnlySet.h @@ -1,5 +1,5 @@ /* - * WriteOnlySet.cpp + * WriteOnlySet.h * * This source file is part of the FoundationDB open source project * @@ -50,7 +50,9 @@ public: explicit WriteOnlySet(); WriteOnlySet(const WriteOnlySet&) = delete; + WriteOnlySet(WriteOnlySet&&) = delete; WriteOnlySet& operator=(const WriteOnlySet&) = delete; + WriteOnlySet& operator=(WriteOnlySet&&) = delete; /** * Attempts to insert \p lineage into the set. This method can fail if the set is full (its size is equal to its @@ -93,7 +95,7 @@ private: // The actual memory std::vector> _set; static_assert(std::atomic::is_always_lock_free, "Index type can't be used as a lock-free type"); - static_assert(std::atomic::is_always_lock_free, "uintptr_t can't be used as a lock-free type"); + static_assert(std::atomic::is_always_lock_free, "uintptr_t can't be used as a lock-free type"); // The freeQueue. On creation all indexes (0..capacity-1) are pushed into this queue. On insert one element from // this queue is consumed and the resulting number is used as an index into the set. 
On erase the index is given From 5bd79de88179945a78e7862d90e7de183d3d690c Mon Sep 17 00:00:00 2001 From: Lukas Joswiak Date: Mon, 22 Mar 2021 10:01:28 -0700 Subject: [PATCH 029/317] Fix build --- flow/Profiler.actor.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/flow/Profiler.actor.cpp b/flow/Profiler.actor.cpp index 46b0bcecb4..24bba87739 100644 --- a/flow/Profiler.actor.cpp +++ b/flow/Profiler.actor.cpp @@ -142,6 +142,8 @@ struct Profiler { } void signal_handler() { // async signal safe! + static std::atomic inSigHandler = false; + if (inSigHandler.exchange(true)) { return; } if (profilingEnabled) { double t = timer(); output_buffer->push(*(void**)&t); From 0ec7340a6f72f8d29b43ade50667d2b0e88ebd75 Mon Sep 17 00:00:00 2001 From: Lukas Joswiak Date: Mon, 22 Mar 2021 10:55:52 -0700 Subject: [PATCH 030/317] Create reference --- flow/WriteOnlySet.actor.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/flow/WriteOnlySet.actor.cpp b/flow/WriteOnlySet.actor.cpp index 364c53460d..92eceea7bc 100644 --- a/flow/WriteOnlySet.actor.cpp +++ b/flow/WriteOnlySet.actor.cpp @@ -92,7 +92,7 @@ std::vector> WriteOnlySet::copy() { // we try to unlock now. If this element was removed while we incremented the refcount, the element will // end up in the freeList, so we will decrement later. 
_set[i].compare_exchange_strong(ptr, ptr ^ LOCK); - result.push_back(entry); + result.push_back(Reference(entry)); } } } @@ -229,4 +229,4 @@ TEST_CASE("/flow/WriteOnlySet") { .detail("LockedErase", numLockedErase.load()); return Void(); } -} // namespace \ No newline at end of file +} // namespace From b246e673bceab43b28cc4a855584333eb3404146 Mon Sep 17 00:00:00 2001 From: Jon Fu Date: Wed, 24 Mar 2021 15:34:19 -0400 Subject: [PATCH 031/317] Added comment to seedShardServers (taken from existing desc in .h file) --- fdbserver/MoveKeys.actor.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fdbserver/MoveKeys.actor.cpp b/fdbserver/MoveKeys.actor.cpp index 83f7170e95..0702b8d097 100644 --- a/fdbserver/MoveKeys.actor.cpp +++ b/fdbserver/MoveKeys.actor.cpp @@ -1212,6 +1212,8 @@ ACTOR Future moveKeys(Database cx, return Void(); } +// Called by the master server to write the very first transaction to the database +// establishing a set of shard servers and all invariants of the systemKeys. 
void seedShardServers(Arena& arena, CommitTransactionRef& tr, vector servers) { std::map, Tag> dcId_locality; std::map server_tag; From 2dfd420882537d7fa7d477c08b699f1a5e961a1c Mon Sep 17 00:00:00 2001 From: Lukas Joswiak Date: Wed, 24 Mar 2021 14:52:42 -0700 Subject: [PATCH 032/317] Add sampling profiler thread --- fdbrpc/AsyncFileKAIO.actor.h | 6 +++++- fdbrpc/IAsyncFile.h | 4 ++++ fdbrpc/Net2FileSystem.cpp | 4 ++++ fdbrpc/Net2FileSystem.h | 3 +++ fdbrpc/sim2.actor.cpp | 4 ++++ fdbrpc/simulator.h | 4 ++++ fdbserver/fdbserver.actor.cpp | 1 + flow/Platform.actor.cpp | 27 +++++++++++++++++++++++++++ flow/Platform.h | 2 ++ 9 files changed, 54 insertions(+), 1 deletion(-) diff --git a/fdbrpc/AsyncFileKAIO.actor.h b/fdbrpc/AsyncFileKAIO.actor.h index 5e6592e6ba..dbdb040d00 100644 --- a/fdbrpc/AsyncFileKAIO.actor.h +++ b/fdbrpc/AsyncFileKAIO.actor.h @@ -242,7 +242,11 @@ public: // result = map(result, [=](int r) mutable { KAIOLogBlockEvent(io, OpLogEntry::READY, r); return r; }); #endif - return success(result); + auto& actorLineageSet = IAsyncFileSystem::filesystem()->getActorLineageSet(); + auto index = actorLineageSet.insert(currentLineage); + Future res = success(result); + actorLineageSet.erase(index); + return res; } // TODO(alexmiller): Remove when we upgrade the dev docker image to >14.10 #ifndef FALLOC_FL_ZERO_RANGE diff --git a/fdbrpc/IAsyncFile.h b/fdbrpc/IAsyncFile.h index ed703514c6..ad48db5f07 100644 --- a/fdbrpc/IAsyncFile.h +++ b/fdbrpc/IAsyncFile.h @@ -25,6 +25,7 @@ #include #include "flow/flow.h" +#include "flow/WriteOnlySet.h" #include "fdbrpc/IRateControl.h" // All outstanding operations must be cancelled before the destructor of IAsyncFile is called. @@ -118,6 +119,9 @@ public: // Returns the time of the last modification of the file. virtual Future lastWriteTime(const std::string& filename) = 0; + // Returns the shared memory data structure used to store actor lineages. 
+ virtual ActorLineageSet& getActorLineageSet() = 0; + static IAsyncFileSystem* filesystem() { return filesystem(g_network); } static runCycleFuncPtr runCycleFunc() { return reinterpret_cast( diff --git a/fdbrpc/Net2FileSystem.cpp b/fdbrpc/Net2FileSystem.cpp index 71a7d784a1..8e895c08dc 100644 --- a/fdbrpc/Net2FileSystem.cpp +++ b/fdbrpc/Net2FileSystem.cpp @@ -89,6 +89,10 @@ Future Net2FileSystem::lastWriteTime(const std::string& filename) { return Net2AsyncFile::lastWriteTime(filename); } +ActorLineageSet& Net2FileSystem::getActorLineageSet() { + return actorLineageSet; +} + void Net2FileSystem::newFileSystem(double ioTimeout, const std::string& fileSystemPath) { g_network->setGlobal(INetwork::enFileSystem, (flowGlobalType) new Net2FileSystem(ioTimeout, fileSystemPath)); } diff --git a/fdbrpc/Net2FileSystem.h b/fdbrpc/Net2FileSystem.h index 702b87828f..0c2229b5ca 100644 --- a/fdbrpc/Net2FileSystem.h +++ b/fdbrpc/Net2FileSystem.h @@ -39,6 +39,8 @@ public: Future renameFile(std::string const& from, std::string const& to) override; + ActorLineageSet& getActorLineageSet() override; + // void init(); static void stop(); @@ -52,6 +54,7 @@ public: dev_t fileSystemDeviceId; bool checkFileSystem; #endif + ActorLineageSet actorLineageSet; }; #endif diff --git a/fdbrpc/sim2.actor.cpp b/fdbrpc/sim2.actor.cpp index 6101ca8512..e9219f3ff3 100644 --- a/fdbrpc/sim2.actor.cpp +++ b/fdbrpc/sim2.actor.cpp @@ -2494,6 +2494,10 @@ Future Sim2FileSystem::lastWriteTime(const std::string& filename) { return fileWrites[filename]; } +ActorLineageSet& Sim2FileSystem::getActorLineageSet() { + return actorLineageSet; +} + void Sim2FileSystem::newFileSystem() { g_network->setGlobal(INetwork::enFileSystem, (flowGlobalType) new Sim2FileSystem()); } diff --git a/fdbrpc/simulator.h b/fdbrpc/simulator.h index cde0eb0dda..08b4264e81 100644 --- a/fdbrpc/simulator.h +++ b/fdbrpc/simulator.h @@ -471,6 +471,8 @@ public: Future lastWriteTime(const std::string& filename) override; + ActorLineageSet& 
getActorLineageSet() override; + Future renameFile(std::string const& from, std::string const& to) override; Sim2FileSystem() {} @@ -478,6 +480,8 @@ public: ~Sim2FileSystem() override {} static void newFileSystem(); + + ActorLineageSet actorLineageSet; }; #endif diff --git a/fdbserver/fdbserver.actor.cpp b/fdbserver/fdbserver.actor.cpp index a285c0b958..fbcd7fd9ee 100644 --- a/fdbserver/fdbserver.actor.cpp +++ b/fdbserver/fdbserver.actor.cpp @@ -1948,6 +1948,7 @@ int main(int argc, char* argv[]) { ASSERT(opts.connectionFile); setupRunLoopProfiler(); + setupSamplingProfiler(); auto dataFolder = opts.dataFolder; if (!dataFolder.size()) diff --git a/flow/Platform.actor.cpp b/flow/Platform.actor.cpp index 42d8decccc..756fb6a7e3 100644 --- a/flow/Platform.actor.cpp +++ b/flow/Platform.actor.cpp @@ -48,6 +48,8 @@ #include "flow/UnitTest.h" #include "flow/FaultInjection.h" +#include "fdbrpc/IAsyncFile.h" + #ifdef _WIN32 #include #include @@ -3673,6 +3675,31 @@ void setupRunLoopProfiler() { #endif } +void* sampleThread(void* arg) { + while (true) { + threadSleep(1.0); // TODO: Read sample rate from global config + + // TODO: Copy actor lineage of currently running actor + + auto diskAlps = IAsyncFileSystem::filesystem()->getActorLineageSet().copy(); + printf("Disk ALPs: %d\n", diskAlps.size()); + + // TODO: Call collect on all actor lineages + for (auto actorLineage : diskAlps) { + } + + // TODO: Serialize collected actor linage properties + } + + return nullptr; +} + +void setupSamplingProfiler() { + // TODO: Add knob + TraceEvent("StartingSamplingProfilerThread"); + startThread(&sampleThread, nullptr); +} + // UnitTest for getMemoryInfo #ifdef __linux__ TEST_CASE("/flow/Platform/getMemoryInfo") { diff --git a/flow/Platform.h b/flow/Platform.h index 74c9395c53..edf9ff3997 100644 --- a/flow/Platform.h +++ b/flow/Platform.h @@ -741,6 +741,8 @@ void registerCrashHandler(); void setupRunLoopProfiler(); EXTERNC void setProfilingEnabled(int enabled); +void 
setupSamplingProfiler(); + // Use _exit() or criticalError(), not exit() #define exit static_assert(false, "Calls to exit() are forbidden by policy"); From 36f4c17ef143cd3c82b7038f001d256867e2a7fa Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Wed, 24 Mar 2021 15:04:45 -0700 Subject: [PATCH 033/317] Reduce the number of actor calls in load balancing to improve performance. --- fdbrpc/LoadBalance.actor.h | 321 +++++++++++++++++++++---------------- 1 file changed, 184 insertions(+), 137 deletions(-) diff --git a/fdbrpc/LoadBalance.actor.h b/fdbrpc/LoadBalance.actor.h index 9b47912993..78f73352ba 100644 --- a/fdbrpc/LoadBalance.actor.h +++ b/fdbrpc/LoadBalance.actor.h @@ -75,109 +75,169 @@ struct LoadBalancedReply { Optional getLoadBalancedReply(const LoadBalancedReply* reply); Optional getLoadBalancedReply(const void*); -// Returns true if we got a value for our request -// Throws an error if the request returned an error that should bubble out -// Returns false if we got an error that should result in reissuing the request -template -bool checkAndProcessResult(ErrorOr result, Reference holder, bool atMostOnce, bool triedAllOptions) { - Optional loadBalancedReply; - if (!result.isError()) { - loadBalancedReply = getLoadBalancedReply(&result.get()); +// Stores state for a request made by the load balancer +template +struct RequestData : NonCopyable { + Future> response; + Reference modelHolder; + Future backoffDelay; + RequestStream const* stream = nullptr; + bool triedAllOptions = false; + + bool requestStarted = false; // true once the request has been sent to an alternative + bool requestProcessed = false; // true once a response has been received and handled by checkAndProcessResult + + // Whether or not the response future is valid + // This is true once setupRequest is called, even though at that point the response is Never(). 
+ bool isValid() { return response.isValid(); } + + // Initializes the request state and starts the backoff delay + void setupRequest(double backoff, bool triedAllOptions, RequestStream const* stream) { + backoffDelay = (backoff > 0) ? delay(backoff) : Void(); + response = Never(); + modelHolder = Reference(); + requestStarted = false; + requestProcessed = false; + + this->stream = stream; + this->triedAllOptions = triedAllOptions; } - int errCode; - if (loadBalancedReply.present()) { - errCode = - loadBalancedReply.get().error.present() ? loadBalancedReply.get().error.get().code() : error_code_success; - } else { - errCode = result.isError() ? result.getError().code() : error_code_success; + // Sends the request to the configured stream + // This should not be called until after setupRequest has been called and the backoff delay has elapsed + void startRequest(Request request, QueueModel* model) { + ASSERT(stream); + ASSERT(backoffDelay.isReady()); + + backoffDelay = Never(); + modelHolder = Reference(new ModelHolder(model, stream->getEndpoint().token.first())); + response = stream->tryGetReply(request); + requestStarted = true; } - bool maybeDelivered = errCode == error_code_broken_promise || errCode == error_code_request_maybe_delivered; - bool receivedResponse = loadBalancedReply.present() ? !loadBalancedReply.get().error.present() : result.present(); - receivedResponse = receivedResponse || (!maybeDelivered && errCode != error_code_process_behind); - bool futureVersion = errCode == error_code_future_version || errCode == error_code_process_behind; + // Implementation of the logic to handle a response. 
+ // Checks the state of the response, updates the queue model, and returns one of the following outcomes: + // A return value of true means that the request completed successfully + // A return value of false means that the request failed but should be retried + // A return value with an error means that the error should be thrown back to original caller + static ErrorOr checkAndProcessResultImpl(ErrorOr result, + Reference modelHolder, + bool atMostOnce, + bool triedAllOptions) { + ASSERT(modelHolder); - holder->release( - receivedResponse, futureVersion, loadBalancedReply.present() ? loadBalancedReply.get().penalty : -1.0); + Optional loadBalancedReply; + if (!result.isError()) { + loadBalancedReply = getLoadBalancedReply(&result.get()); + } + + int errCode; + if (loadBalancedReply.present()) { + errCode = loadBalancedReply.get().error.present() ? loadBalancedReply.get().error.get().code() + : error_code_success; + } else { + errCode = result.isError() ? result.getError().code() : error_code_success; + } + + bool maybeDelivered = errCode == error_code_broken_promise || errCode == error_code_request_maybe_delivered; + bool receivedResponse = + loadBalancedReply.present() ? !loadBalancedReply.get().error.present() : result.present(); + receivedResponse = receivedResponse || (!maybeDelivered && errCode != error_code_process_behind); + bool futureVersion = errCode == error_code_future_version || errCode == error_code_process_behind; + + modelHolder->release( + receivedResponse, futureVersion, loadBalancedReply.present() ? loadBalancedReply.get().penalty : -1.0); + + if (errCode == error_code_server_overloaded) { + return false; + } + + if (loadBalancedReply.present() && !loadBalancedReply.get().error.present()) { + return true; + } + + if (!loadBalancedReply.present() && result.present()) { + return true; + } + + if (receivedResponse) { + return loadBalancedReply.present() ? 
loadBalancedReply.get().error.get() : result.getError(); + } + + if (atMostOnce && maybeDelivered) { + return request_maybe_delivered(); + } + + if (triedAllOptions && errCode == error_code_process_behind) { + return process_behind(); + } - if (errCode == error_code_server_overloaded) { return false; } - if (loadBalancedReply.present() && !loadBalancedReply.get().error.present()) { - return true; + // Checks the state of the response, updates the queue model, and returns one of the following outcomes: + // A return value of true means that the request completed successfully + // A return value of false means that the request failed but should be retried + // In the event of a non-retryable failure, an error is thrown indicating the failure + bool checkAndProcessResult(bool atMostOnce) { + ASSERT(response.isReady()); + requestProcessed = true; + + ErrorOr outcome = + checkAndProcessResultImpl(response.get(), std::move(modelHolder), atMostOnce, triedAllOptions); + + if (outcome.isError()) { + throw outcome.getError(); + } else if (!outcome.get()) { + response = Future>(); + } + + return outcome.get(); } - if (!loadBalancedReply.present() && result.present()) { - return true; + // Convert this request to a lagging request. Such a request is no longer being waited on, but it still needs to be + // processed so we can update the queue model. 
+ void makeLaggingRequest() { + ASSERT(response.isValid()); + ASSERT(!response.isReady()); + ASSERT(modelHolder); + ASSERT(modelHolder->model); + + QueueModel* model = modelHolder->model; + if (model->laggingRequestCount > FLOW_KNOBS->MAX_LAGGING_REQUESTS_OUTSTANDING || + model->laggingRequests.isReady()) { + model->laggingRequests.cancel(); + model->laggingRequestCount = 0; + model->addActor = PromiseStream>(); + model->laggingRequests = actorCollection(model->addActor.getFuture(), &model->laggingRequestCount); + } + + // We need to process the lagging request in order to update the queue model + Reference holderCapture = std::move(modelHolder); + bool triedAllOptionsCapture = triedAllOptions; + Future updateModel = + map(response, [holderCapture, triedAllOptionsCapture](ErrorOr result) { + checkAndProcessResultImpl(result, holderCapture, false, triedAllOptionsCapture); + return Void(); + }); + model->addActor.send(updateModel); } - if (receivedResponse) { - throw loadBalancedReply.present() ? 
loadBalancedReply.get().error.get() : result.getError(); - } - - if (atMostOnce && maybeDelivered) { - throw request_maybe_delivered(); - } - - if (triedAllOptions && errCode == error_code_process_behind) { - throw process_behind(); - } - - return false; -} - -ACTOR template -Future> makeRequest(RequestStream const* stream, - Request request, - double backoff, - Future requestUnneeded, - QueueModel* model, - bool isFirstRequest, - bool atMostOnce, - bool triedAllOptions) { - if (backoff > 0.0) { - wait(delay(backoff) || requestUnneeded); - } - - if (requestUnneeded.isReady()) { - return Optional(); - } - - state Reference holder(new ModelHolder(model, stream->getEndpoint().token.first())); - - ErrorOr result = wait(stream->tryGetReply(request)); - if (checkAndProcessResult(result, holder, atMostOnce, triedAllOptions)) { - return result.get(); - } else { - return Optional(); - } -} - -template -void addLaggingRequest(Future> reply, Promise requestFinished, QueueModel* model) { - requestFinished.send(Void()); - if (!reply.isReady()) { - if (model) { - if (model->laggingRequestCount > FLOW_KNOBS->MAX_LAGGING_REQUESTS_OUTSTANDING || - model->laggingRequests.isReady()) { - model->laggingRequests.cancel(); - model->laggingRequestCount = 0; - model->addActor = PromiseStream>(); - model->laggingRequests = actorCollection(model->addActor.getFuture(), &model->laggingRequestCount); - } - - model->addActor.send(success(errorOr(reply))); + ~RequestData() { + // If the request has been started but hasn't completed, mark it as a lagging request + if (requestStarted && !requestProcessed && modelHolder && modelHolder->model) { + makeLaggingRequest(); } } -} +}; -// Keep trying to get a reply from any of servers until success or cancellation; tries to take into account -// failMon's information for load balancing and avoiding failed servers +// Try to get a reply from one of the alternatives until success, cancellation, or certain errors. 
+// Load balancing has a budget to race requests to a second alternative if the first request is slow. +// Tries to take into account failMon's information for load balancing and avoiding failed servers. // If ALL the servers are failed and the list of servers is not fresh, throws an exception to let the caller refresh the -// list of servers. When model is set, load balance among alternatives in the same DC, aiming to balance request queue -// length on these interfaces. If too many interfaces in the same DC are bad, try remote interfaces. +// list of servers. +// When model is set, load balance among alternatives in the same DC aims to balance request queue length on these +// interfaces. If too many interfaces in the same DC are bad, try remote interfaces. ACTOR template Future loadBalance( Reference> alternatives, @@ -186,9 +246,11 @@ Future loadBalance( TaskPriority taskID = TaskPriority::DefaultPromiseEndpoint, bool atMostOnce = false, // if true, throws request_maybe_delivered() instead of retrying automatically QueueModel* model = nullptr) { - state Future> firstRequest; + + state RequestData firstRequestData; + state RequestData secondRequestData; + state Optional firstRequestEndpoint; - state Future> secondRequest; state Future secondDelay = Never(); state Promise requestFinished; @@ -320,7 +382,7 @@ Future loadBalance( } // Find an alternative, if any, that is not failed, starting with - // nextAlt. This logic matters only if model == NULL. Otherwise, the + // nextAlt. This logic matters only if model == nullptr. Otherwise, the // bestAlt and nextAlt have been decided. state RequestStream const* stream = nullptr; for (int alternativeNum = 0; alternativeNum < alternatives->size(); alternativeNum++) { @@ -340,7 +402,7 @@ Future loadBalance( stream = nullptr; } - if (!stream && !firstRequest.isValid()) { + if (!stream && !firstRequestData.isValid()) { // Everything is down! Wait for someone to be up. 
vector> ok(alternatives->size()); @@ -391,50 +453,40 @@ Future loadBalance( numAttempts = 0; // now that we've got a server back, reset the backoff } else if (!stream) { // Only the first location is available. - Optional result = wait(firstRequest); - if (result.present()) { - return result.get(); - } + loop choose { + when(wait(firstRequestData.backoffDelay)) { firstRequestData.startRequest(request, model); } + when(ErrorOr result = wait(firstRequestData.response)) { + if (firstRequestData.checkAndProcessResult(atMostOnce)) { + return result.get(); + } - firstRequest = Future>(); - firstRequestEndpoint = Optional(); - } else if (firstRequest.isValid()) { + firstRequestEndpoint = Optional(); + break; + } + } + } else if (firstRequestData.isValid()) { // Issue a second request, the first one is taking a long time. - secondRequest = makeRequest( - stream, request, backoff, requestFinished.getFuture(), model, false, atMostOnce, triedAllOptions); + secondRequestData.setupRequest(backoff, triedAllOptions, stream); state bool firstFinished = false; - loop { - choose { - when(ErrorOr> result = - wait(firstRequest.isValid() ? errorOr(firstRequest) : Never())) { - if (result.isError() || result.get().present()) { - addLaggingRequest(secondRequest, requestFinished, model); - if (result.isError()) { - throw result.getError(); - } else { - return result.get().get(); - } - } - - firstRequest = Future>(); - firstRequestEndpoint = Optional(); - firstFinished = true; + loop choose { + when(wait(firstRequestData.backoffDelay)) { firstRequestData.startRequest(request, model); } + when(wait(secondRequestData.backoffDelay)) { secondRequestData.startRequest(request, model); } + when(ErrorOr result = + wait(firstRequestData.response.isValid() ? 
firstRequestData.response : Never())) { + if (firstRequestData.checkAndProcessResult(atMostOnce)) { + return result.get(); } - when(ErrorOr> result = wait(errorOr(secondRequest))) { - if (result.isError() || result.get().present()) { - if (!firstFinished) { - addLaggingRequest(firstRequest, requestFinished, model); - } - if (result.isError()) { - throw result.getError(); - } else { - return result.get().get(); - } - } - break; + firstRequestEndpoint = Optional(); + firstFinished = true; + } + when(ErrorOr result = wait(secondRequestData.response)) { + if (secondRequestData.checkAndProcessResult(atMostOnce)) { + return result.get(); } + + break; } } @@ -445,13 +497,13 @@ Future loadBalance( } } else { // Issue a request, if it takes too long to get a reply, go around the loop - firstRequest = makeRequest( - stream, request, backoff, requestFinished.getFuture(), model, true, atMostOnce, triedAllOptions); + firstRequestData.setupRequest(backoff, triedAllOptions, stream); firstRequestEndpoint = stream->getEndpoint().token.first(); loop { choose { - when(ErrorOr> result = wait(errorOr(firstRequest))) { + when(wait(firstRequestData.backoffDelay)) { firstRequestData.startRequest(request, model); } + when(ErrorOr result = wait(firstRequestData.response)) { if (model) { model->secondMultiplier = std::max(model->secondMultiplier - FLOW_KNOBS->SECOND_REQUEST_MULTIPLIER_DECAY, 1.0); @@ -460,15 +512,10 @@ Future loadBalance( FLOW_KNOBS->SECOND_REQUEST_MAX_BUDGET); } - if (result.isError()) { - throw result.getError(); + if (firstRequestData.checkAndProcessResult(atMostOnce)) { + return result.get(); } - if (result.get().present()) { - return result.get().get(); - } - - firstRequest = Future>(); firstRequestEndpoint = Optional(); break; } From f7d3b31ef8f93a9ec845bef3a8216e70c384d804 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Wed, 24 Mar 2021 16:27:35 -0600 Subject: [PATCH 034/317] Actually close files in simulation --- fdbrpc/AsyncFileNonDurable.actor.h | 4 ++++ 
fdbrpc/sim2.actor.cpp | 16 ++++++++-------- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/fdbrpc/AsyncFileNonDurable.actor.h b/fdbrpc/AsyncFileNonDurable.actor.h index 49fe0e2c8f..13fdcc25a5 100644 --- a/fdbrpc/AsyncFileNonDurable.actor.h +++ b/fdbrpc/AsyncFileNonDurable.actor.h @@ -267,6 +267,10 @@ public: Future deleteFuture = deleteFile(this); if (!deleteFuture.isReady()) filesBeingDeleted[filename] = deleteFuture; + } else if (isSoleOwner()) { + // isSoleOwner is a bit confusing here. What we mean is that the openFiles map is the sole owner. If we + // we remove the file from the map to make sure it gets closed. + g_simulator.getCurrentProcess()->machine->openFiles.erase(filename); } } diff --git a/fdbrpc/sim2.actor.cpp b/fdbrpc/sim2.actor.cpp index 1af14ec676..6cddbb7e88 100644 --- a/fdbrpc/sim2.actor.cpp +++ b/fdbrpc/sim2.actor.cpp @@ -536,7 +536,10 @@ public: std::string getFilename() const override { return actualFilename; } - ~SimpleFile() override { _close(h); } + ~SimpleFile() override { + _close(h); + --openCount; + } private: int h; @@ -1933,10 +1936,7 @@ public: TraceEvent("ClogInterface") .detail("IP", ip.toString()) .detail("Delay", seconds) - .detail("Queue", - mode == ClogSend ? "Send" - : mode == ClogReceive ? "Receive" - : "All"); + .detail("Queue", mode == ClogSend ? "Send" : mode == ClogReceive ? "Receive" : "All"); if (mode == ClogSend || mode == ClogAll) g_clogging.clogSendFor(ip, seconds); @@ -2408,9 +2408,9 @@ int sf_open(const char* filename, int flags, int convFlags, int mode) { GENERIC_READ | ((flags & IAsyncFile::OPEN_READWRITE) ? GENERIC_WRITE : 0), FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, nullptr, - (flags & IAsyncFile::OPEN_EXCLUSIVE) ? CREATE_NEW - : (flags & IAsyncFile::OPEN_CREATE) ? OPEN_ALWAYS - : OPEN_EXISTING, + (flags & IAsyncFile::OPEN_EXCLUSIVE) + ? CREATE_NEW + : (flags & IAsyncFile::OPEN_CREATE) ? 
OPEN_ALWAYS : OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, nullptr); int h = -1; From 6a344ddeab4eac19ee34f1af7649a6b5e8e39efc Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Wed, 24 Mar 2021 16:56:11 -0600 Subject: [PATCH 035/317] fix typo --- fdbrpc/AsyncFileNonDurable.actor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbrpc/AsyncFileNonDurable.actor.h b/fdbrpc/AsyncFileNonDurable.actor.h index 13fdcc25a5..8cc65bf4a5 100644 --- a/fdbrpc/AsyncFileNonDurable.actor.h +++ b/fdbrpc/AsyncFileNonDurable.actor.h @@ -269,7 +269,7 @@ public: filesBeingDeleted[filename] = deleteFuture; } else if (isSoleOwner()) { // isSoleOwner is a bit confusing here. What we mean is that the openFiles map is the sole owner. If we - // we remove the file from the map to make sure it gets closed. + // remove the file from the map to make sure it gets closed. g_simulator.getCurrentProcess()->machine->openFiles.erase(filename); } } From b5412b355e3f900f7b40adadf5d7b51ee142141a Mon Sep 17 00:00:00 2001 From: Nim Wijetunga Date: Wed, 24 Mar 2021 23:34:34 +0000 Subject: [PATCH 036/317] Add Java API for network busyness --- bindings/java/fdbJNI.cpp | 11 +++++++++++ .../src/main/com/apple/foundationdb/Database.java | 9 +++++++++ .../src/main/com/apple/foundationdb/FDBDatabase.java | 11 +++++++++++ 3 files changed, 31 insertions(+) diff --git a/bindings/java/fdbJNI.cpp b/bindings/java/fdbJNI.cpp index ebe83269e6..f5d66577fd 100644 --- a/bindings/java/fdbJNI.cpp +++ b/bindings/java/fdbJNI.cpp @@ -580,6 +580,17 @@ JNIEXPORT void JNICALL Java_com_apple_foundationdb_FDBDatabase_Database_1setOpti } } +JNIEXPORT jdouble JNICALL Java_com_apple_foundationdb_FDBDatabase_Database_1getMainThreadBusyness(JNIEnv* jenv, + jobject, + jlong dbPtr) { + if (!dbPtr) { + throwParamNotNull(jenv); + return 0; + } + FDBDatabase* database = (FDBDatabase*)dbPtr; + return (jdouble) fdb_database_get_main_thread_busyness(database); +} + JNIEXPORT jboolean JNICALL 
Java_com_apple_foundationdb_FDB_Error_1predicate(JNIEnv* jenv, jobject, jint predicate, diff --git a/bindings/java/src/main/com/apple/foundationdb/Database.java b/bindings/java/src/main/com/apple/foundationdb/Database.java index e5f2d36de6..741fa1c5eb 100644 --- a/bindings/java/src/main/com/apple/foundationdb/Database.java +++ b/bindings/java/src/main/com/apple/foundationdb/Database.java @@ -80,6 +80,15 @@ public interface Database extends AutoCloseable, TransactionContext { */ DatabaseOptions options(); + /** + * Returns a value which indicates the saturation of the client + *
+ * Note: By default, this value is updated every second + * + * @return a value where 0 indicates that the client is idle and 1 (or larger) indicates that the client is saturated. + */ + double getMainThreadBusyness(); + /** * Runs a read-only transactional function against this {@code Database} with retry logic. * {@link Function#apply(Object) apply(ReadTransaction)} will be called on the diff --git a/bindings/java/src/main/com/apple/foundationdb/FDBDatabase.java b/bindings/java/src/main/com/apple/foundationdb/FDBDatabase.java index 620b5aaa4e..8df1fd75b6 100644 --- a/bindings/java/src/main/com/apple/foundationdb/FDBDatabase.java +++ b/bindings/java/src/main/com/apple/foundationdb/FDBDatabase.java @@ -150,6 +150,16 @@ class FDBDatabase extends NativeObjectWrapper implements Database, OptionConsume } } + @Override + public double getMainThreadBusyness() { + pointerReadLock.lock(); + try { + return Database_getMainThreadBusyness(getPtr()); + } finally { + pointerReadLock.unlock(); + } + } + @Override public Executor getExecutor() { return executor; @@ -163,4 +173,5 @@ class FDBDatabase extends NativeObjectWrapper implements Database, OptionConsume private native long Database_createTransaction(long cPtr); private native void Database_dispose(long cPtr); private native void Database_setOption(long cPtr, int code, byte[] value) throws FDBException; + private native double Database_getMainThreadBusyness(long cPtr); } \ No newline at end of file From 21f1e1d5de98ab75264ccc30cecd35f682b9b647 Mon Sep 17 00:00:00 2001 From: Nim Wijetunga Date: Wed, 24 Mar 2021 23:38:42 +0000 Subject: [PATCH 037/317] add comment --- bindings/java/fdbJNI.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/bindings/java/fdbJNI.cpp b/bindings/java/fdbJNI.cpp index f5d66577fd..482098e815 100644 --- a/bindings/java/fdbJNI.cpp +++ b/bindings/java/fdbJNI.cpp @@ -580,6 +580,9 @@ JNIEXPORT void JNICALL Java_com_apple_foundationdb_FDBDatabase_Database_1setOpti } } +// Get network thread 
busyness (updated every 1s) +// A value of 0 indicates that the client is more or less idle +// A value of 1 (or more) indicates that the client is saturated JNIEXPORT jdouble JNICALL Java_com_apple_foundationdb_FDBDatabase_Database_1getMainThreadBusyness(JNIEnv* jenv, jobject, jlong dbPtr) { From a84592df7e4151e54a3a7717e58d24e2f701f410 Mon Sep 17 00:00:00 2001 From: Nim Wijetunga Date: Wed, 24 Mar 2021 23:59:40 +0000 Subject: [PATCH 038/317] add test for network busyness --- .../test/com/apple/foundationdb/test/AsyncStackTester.java | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/bindings/java/src/test/com/apple/foundationdb/test/AsyncStackTester.java b/bindings/java/src/test/com/apple/foundationdb/test/AsyncStackTester.java index f584f452a9..87ea5adfe0 100644 --- a/bindings/java/src/test/com/apple/foundationdb/test/AsyncStackTester.java +++ b/bindings/java/src/test/com/apple/foundationdb/test/AsyncStackTester.java @@ -510,6 +510,12 @@ public class AsyncStackTester { db.options().setTransactionCausalReadRisky(); db.options().setTransactionIncludePortInAddress(); + // Test network busyness + double busyness = db.getMainThreadBusyness(); + if (busyness < 0) { + throw new IllegalStateException("Network busyness cannot be less than 0"); + } + tr.options().setPrioritySystemImmediate(); tr.options().setPriorityBatch(); tr.options().setCausalReadRisky(); From bdccf8bc801504e846892b242a57ab829818b643 Mon Sep 17 00:00:00 2001 From: Nim Wijetunga Date: Thu, 25 Mar 2021 00:11:11 +0000 Subject: [PATCH 039/317] fix formatting issues --- bindings/java/fdbJNI.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bindings/java/fdbJNI.cpp b/bindings/java/fdbJNI.cpp index 482098e815..06acae658e 100644 --- a/bindings/java/fdbJNI.cpp +++ b/bindings/java/fdbJNI.cpp @@ -584,14 +584,14 @@ JNIEXPORT void JNICALL Java_com_apple_foundationdb_FDBDatabase_Database_1setOpti // A value of 0 indicates that the client is more or less idle // A value of 1 (or more) 
indicates that the client is saturated JNIEXPORT jdouble JNICALL Java_com_apple_foundationdb_FDBDatabase_Database_1getMainThreadBusyness(JNIEnv* jenv, - jobject, - jlong dbPtr) { + jobject, + jlong dbPtr) { if (!dbPtr) { throwParamNotNull(jenv); return 0; } FDBDatabase* database = (FDBDatabase*)dbPtr; - return (jdouble) fdb_database_get_main_thread_busyness(database); + return (jdouble)fdb_database_get_main_thread_busyness(database); } JNIEXPORT jboolean JNICALL Java_com_apple_foundationdb_FDB_Error_1predicate(JNIEnv* jenv, From b51e4aa59048ed73afbb6a6d82b4d86f520f6129 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Wed, 24 Mar 2021 19:57:24 -0600 Subject: [PATCH 040/317] handle file renames properly --- fdbrpc/AsyncFileNonDurable.actor.h | 12 +++++++++++- flow/flow.h | 2 ++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/fdbrpc/AsyncFileNonDurable.actor.h b/fdbrpc/AsyncFileNonDurable.actor.h index 8cc65bf4a5..21cfda8907 100644 --- a/fdbrpc/AsyncFileNonDurable.actor.h +++ b/fdbrpc/AsyncFileNonDurable.actor.h @@ -270,7 +270,17 @@ public: } else if (isSoleOwner()) { // isSoleOwner is a bit confusing here. What we mean is that the openFiles map is the sole owner. If we // remove the file from the map to make sure it gets closed. - g_simulator.getCurrentProcess()->machine->openFiles.erase(filename); + auto& openFiles = g_simulator.getCurrentProcess()->machine->openFiles; + auto iter = openFiles.find(filename); + // the file could've been renamed (DiskQueue does that for example). In that case the file won't be in the + // map anymore. + if (iter != openFiles.end()) { + // even if the filename exists, it doesn't mean that it references the same file. It could be that the + // file was renamed and later a file with the same name was opened. 
+ if (iter->second.canGet() && iter->second.get().getPtr() == this) { + openFiles.erase(filename); + } + } } } diff --git a/flow/flow.h b/flow/flow.h index 987572d7c5..e03d598d9b 100644 --- a/flow/flow.h +++ b/flow/flow.h @@ -674,6 +674,8 @@ public: bool isValid() const { return sav != 0; } bool isReady() const { return sav->isSet(); } bool isError() const { return sav->isError(); } + // returns true if get can be called on this future (counterpart of canBeSet on Promises) + bool canGet() const { return isValid() && isReady() && !isError(); } Error& getError() const { ASSERT(isError()); return sav->error_state; From 1385a776daa0b90cb20478251d0faf8766cb1a10 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Thu, 25 Mar 2021 13:22:29 -0600 Subject: [PATCH 041/317] only remove files from the open map if they have no modifications in flight --- fdbrpc/AsyncFileNonDurable.actor.h | 49 ++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/fdbrpc/AsyncFileNonDurable.actor.h b/fdbrpc/AsyncFileNonDurable.actor.h index 21cfda8907..281b3f289d 100644 --- a/fdbrpc/AsyncFileNonDurable.actor.h +++ b/fdbrpc/AsyncFileNonDurable.actor.h @@ -259,6 +259,37 @@ public: //TraceEvent("AsyncFileNonDurable_Destroy", id).detail("Filename", filename); } + // The purpose of this actor is to simply keep a reference to a non-durable file until all pending modifications + // have completed. When they return, this actor will die and therefore decrement the reference count by 1. 
+ ACTOR void waitOnOutstandingModifications(Reference self) { + state ISimulator::ProcessInfo* currentProcess = g_simulator.getCurrentProcess(); + state TaskPriority currentTaskID = g_network->getCurrentTask(); + state std::string filename = self->filename; + + wait(g_simulator.onMachine(currentProcess)); + try { + Promise startSyncPromise = self->startSyncPromise; + self->startSyncPromise = Promise(); + startSyncPromise.send(true); + + std::vector> outstandingModifications; + + for (auto itr = self->pendingModifications.ranges().begin(); + itr != self->pendingModifications.ranges().end(); + ++itr) + if (itr->value().isValid() && !itr->value().isReady()) + outstandingModifications.push_back(itr->value()); + + // Ignore errors here so that all modifications can finish + wait(waitForAllReady(outstandingModifications)); + wait(g_simulator.onProcess(currentProcess, currentTaskID)); + } catch (Error& e) { + state Error err = e; + wait(g_simulator.onProcess(currentProcess, currentTaskID)); + throw err; + } + } + void addref() override { ReferenceCounted::addref(); } void delref() override { if (delref_no_destroy()) { @@ -270,6 +301,24 @@ public: } else if (isSoleOwner()) { // isSoleOwner is a bit confusing here. What we mean is that the openFiles map is the sole owner. If we // remove the file from the map to make sure it gets closed. + bool hasPendingModifications = false; + for (auto iter = pendingModifications.ranges().begin(); iter != pendingModifications.ranges().end(); + ++iter) { + if (iter->value().isValid() && !iter->value().isReady()) { + hasPendingModifications = true; + break; + } + } + if (hasPendingModifications) { + // If we still have pending references we won't close the file and instead wait for them. But while we + // wait for those to complete, another actor might open the file. So we call into an actor that will + // hold a refernce until all pending operations are complete. If someone opens this file before this + // completes, nothing will happen. 
Otherwise we will enter delref again but this time + // hasPendingModifications will evalualte to false. + addref(); + waitOnOutstandingModifications(Reference(this)); + return; + } auto& openFiles = g_simulator.getCurrentProcess()->machine->openFiles; auto iter = openFiles.find(filename); // the file could've been renamed (DiskQueue does that for example). In that case the file won't be in the From 1033db9fba275a809b3159fc2d52a92293350a45 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Thu, 25 Mar 2021 14:00:07 -0600 Subject: [PATCH 042/317] Revert change --- fdbrpc/AsyncFileNonDurable.actor.h | 47 +++++++----------------------- 1 file changed, 11 insertions(+), 36 deletions(-) diff --git a/fdbrpc/AsyncFileNonDurable.actor.h b/fdbrpc/AsyncFileNonDurable.actor.h index 281b3f289d..f65895067e 100644 --- a/fdbrpc/AsyncFileNonDurable.actor.h +++ b/fdbrpc/AsyncFileNonDurable.actor.h @@ -267,27 +267,20 @@ public: state std::string filename = self->filename; wait(g_simulator.onMachine(currentProcess)); - try { - Promise startSyncPromise = self->startSyncPromise; - self->startSyncPromise = Promise(); - startSyncPromise.send(true); + Promise startSyncPromise = self->startSyncPromise; + self->startSyncPromise = Promise(); + startSyncPromise.send(true); - std::vector> outstandingModifications; + std::vector> outstandingModifications; - for (auto itr = self->pendingModifications.ranges().begin(); - itr != self->pendingModifications.ranges().end(); - ++itr) - if (itr->value().isValid() && !itr->value().isReady()) - outstandingModifications.push_back(itr->value()); + for (auto itr = self->pendingModifications.ranges().begin(); itr != self->pendingModifications.ranges().end(); + ++itr) + if (itr->value().isValid() && !itr->value().isReady()) + outstandingModifications.push_back(itr->value()); - // Ignore errors here so that all modifications can finish - wait(waitForAllReady(outstandingModifications)); - wait(g_simulator.onProcess(currentProcess, currentTaskID)); - } catch 
(Error& e) { - state Error err = e; - wait(g_simulator.onProcess(currentProcess, currentTaskID)); - throw err; - } + // Ignore errors here so that all modifications can finish + wait(waitForAllReady(outstandingModifications)); + wait(g_simulator.onProcess(currentProcess, currentTaskID)); } void addref() override { ReferenceCounted::addref(); } @@ -301,24 +294,6 @@ public: } else if (isSoleOwner()) { // isSoleOwner is a bit confusing here. What we mean is that the openFiles map is the sole owner. If we // remove the file from the map to make sure it gets closed. - bool hasPendingModifications = false; - for (auto iter = pendingModifications.ranges().begin(); iter != pendingModifications.ranges().end(); - ++iter) { - if (iter->value().isValid() && !iter->value().isReady()) { - hasPendingModifications = true; - break; - } - } - if (hasPendingModifications) { - // If we still have pending references we won't close the file and instead wait for them. But while we - // wait for those to complete, another actor might open the file. So we call into an actor that will - // hold a refernce until all pending operations are complete. If someone opens this file before this - // completes, nothing will happen. Otherwise we will enter delref again but this time - // hasPendingModifications will evalualte to false. - addref(); - waitOnOutstandingModifications(Reference(this)); - return; - } auto& openFiles = g_simulator.getCurrentProcess()->machine->openFiles; auto iter = openFiles.find(filename); // the file could've been renamed (DiskQueue does that for example). 
In that case the file won't be in the From c3ba4659ff461d3a5eb16eaa62d563627ea2032b Mon Sep 17 00:00:00 2001 From: Andrew Noyes Date: Fri, 26 Mar 2021 18:06:21 +0000 Subject: [PATCH 043/317] Document that ryw disable can only be set at beginning of transaction --- fdbclient/vexillographer/fdb.options | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbclient/vexillographer/fdb.options b/fdbclient/vexillographer/fdb.options index 82ba1910c2..db68bb31a4 100644 --- a/fdbclient/vexillographer/fdb.options +++ b/fdbclient/vexillographer/fdb.options @@ -210,7 +210,7 @@ description is not currently required but encouraged.