From 76838a20b7bd936472d3431bbc7534afac883dad Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Fri, 30 Oct 2020 09:11:08 -0700 Subject: [PATCH 001/317] A model used to quickly simulate various GRV scenarios and algorithms --- contrib/grv_proxy_model/grv_test.py | 134 ++++++++ contrib/grv_proxy_model/plot.py | 107 +++++++ contrib/grv_proxy_model/priority.py | 40 +++ contrib/grv_proxy_model/proxy_model.py | 338 ++++++++++++++++++++ contrib/grv_proxy_model/rate_model.py | 83 +++++ contrib/grv_proxy_model/ratekeeper_model.py | 67 ++++ contrib/grv_proxy_model/smoother.py | 53 +++ contrib/grv_proxy_model/workload_model.py | 201 ++++++++++++ 8 files changed, 1023 insertions(+) create mode 100755 contrib/grv_proxy_model/grv_test.py create mode 100755 contrib/grv_proxy_model/plot.py create mode 100755 contrib/grv_proxy_model/priority.py create mode 100755 contrib/grv_proxy_model/proxy_model.py create mode 100755 contrib/grv_proxy_model/rate_model.py create mode 100755 contrib/grv_proxy_model/ratekeeper_model.py create mode 100644 contrib/grv_proxy_model/smoother.py create mode 100755 contrib/grv_proxy_model/workload_model.py diff --git a/contrib/grv_proxy_model/grv_test.py b/contrib/grv_proxy_model/grv_test.py new file mode 100755 index 0000000000..1cd0224538 --- /dev/null +++ b/contrib/grv_proxy_model/grv_test.py @@ -0,0 +1,134 @@ +#!/usr/bin/env python3 + +# +# grv_test.py +# +# This source file is part of the FoundationDB open source project +# +# Copyright 2013-2020 Apple Inc. and the FoundationDB project authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +import argparse +import inspect +import sys + +import rate_model +import workload_model +import proxy_model +import ratekeeper_model +from priority import Priority +from plot import Plotter + +parser = argparse.ArgumentParser() +parser.add_argument('-w', '--workload', type=str, help='Name of workload to run') +parser.add_argument('-r', '--ratekeeper', type=str, help='Name of ratekeeper model') +parser.add_argument('-d', '--duration', type=int, default=240, help='Duration of simulated test, in seconds. Defaults to 240.') +parser.add_argument('-L', '--limiter', type=str, default='Original', help='Name of limiter implementation. Defaults to \'Original\'.') +parser.add_argument('-p', '--proxy', type=str, default='ProxyModel', help='Name of proxy implementation. Defaults to \'ProxyModel\'.') +parser.add_argument('--list', action='store_true', default=False, help='List options for all models.') +parser.add_argument('--no-graph', action='store_true', default=False, help='Disable graphical output.') + +args = parser.parse_args() + +def print_choices_list(context=None): + if context == 'workload' or context is None: + print('Workloads:') + for w in workload_model.predefined_workloads.keys(): + print(' %s' % w) + + if context == 'ratekeeper' or context is None: + print('\nRatekeeper models:') + for r in ratekeeper_model.predefined_ratekeeper.keys(): + print(' %s' % r) + + proxy_model_classes = [c for c in [getattr(proxy_model, a) for a in dir(proxy_model)] if inspect.isclass(c)] + + if context == 'proxy' or context is None: + print('\nProxy models:') + for p in proxy_model_classes: + if issubclass(p, proxy_model.ProxyModel): + print(' %s' % p.__name__) + + if context == 'limiter' or context is None: + print('\nProxy limiters:') + for p in proxy_model_classes: + if issubclass(p, proxy_model.Limiter) and p != proxy_model.Limiter: + name = p.__name__ + if 
name.endswith('Limiter'):
+            name = name[0:-len('Limiter')]
+            print('  %s' % name)
+
+# Handle --list before validating required arguments, so that
+# `grv_test.py --list` works without also passing -w/-r.
+if args.list:
+    print_choices_list()
+    sys.exit(0)
+
+if args.workload is None or args.ratekeeper is None:
+    print('ERROR: A workload (-w/--workload) and ratekeeper model (-r/--ratekeeper) must be specified.\n')
+    print_choices_list()
+    sys.exit(1)
+
+def validate_class_type(var, name, superclass):
+    cls = getattr(var, name, None)
+    return cls is not None and inspect.isclass(cls) and issubclass(cls, superclass)
+
+if not args.ratekeeper in ratekeeper_model.predefined_ratekeeper:
+    print('Invalid ratekeeper model `%s\'' % args.ratekeeper)
+    print_choices_list('ratekeeper')
+    sys.exit(1)
+
+if not args.workload in workload_model.predefined_workloads:
+    print('Invalid workload model `%s\'' % args.workload)
+    print_choices_list('workload')
+    sys.exit(1)
+
+if not validate_class_type(proxy_model, args.proxy, proxy_model.ProxyModel):
+    print('Invalid proxy model `%s\'' % args.proxy)
+    print_choices_list('proxy')
+    sys.exit(1)
+
+limiter_name = args.limiter
+if not validate_class_type(proxy_model, limiter_name, proxy_model.Limiter):
+    limiter_name += 'Limiter'
+    if not validate_class_type(proxy_model, limiter_name, proxy_model.Limiter):
+        print('Invalid proxy limiter `%s\'' % args.limiter)
+        print_choices_list('limiter')
+        sys.exit(1)
+
+ratekeeper = ratekeeper_model.predefined_ratekeeper[args.ratekeeper]
+workload = workload_model.predefined_workloads[args.workload]
+
+limiter = getattr(proxy_model, limiter_name)
+proxy = getattr(proxy_model, args.proxy)(args.duration, ratekeeper, workload, limiter)
+
+proxy.run()
+
+for priority in workload.priorities():
+    latencies = sorted([p for t in proxy.results.latencies[priority].values() for p in t])
+    total_started = sum(proxy.results.started[priority].values())
+    still_queued = sum([r.count for r in proxy.request_queue if r.priority == priority])
+
+    if len(latencies) > 0:
+        print('\n%s: %d requests in %d seconds (rate=%f). %d still queued.' 
% (priority, total_started, proxy.time, float(total_started)/proxy.time, still_queued)) + print(' Median latency: %f' % latencies[len(latencies)//2]) + print(' 90%% latency: %f' % latencies[int(0.9*len(latencies))]) + print(' 99%% latency: %f' % latencies[int(0.99*len(latencies))]) + print(' 99.9%% latency: %f' % latencies[int(0.999*len(latencies))]) + print(' Max latency: %f' % latencies[-1]) + +print('') + +if not args.no_graph: + plotter = Plotter(proxy.results) + plotter.display() diff --git a/contrib/grv_proxy_model/plot.py b/contrib/grv_proxy_model/plot.py new file mode 100755 index 0000000000..9334e2c844 --- /dev/null +++ b/contrib/grv_proxy_model/plot.py @@ -0,0 +1,107 @@ +# +# plot.py +# +# This source file is part of the FoundationDB open source project +# +# Copyright 2013-2020 Apple Inc. and the FoundationDB project authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import matplotlib.pyplot as plt + +class Plotter: + def __init__(self, results): + self.results = results + + def add_plot(data, time_resolution, label, use_avg=False): + out_data = {} + counts = {} + for t in data.keys(): + out_data.setdefault(t//time_resolution*time_resolution, 0) + counts.setdefault(t//time_resolution*time_resolution, 0) + out_data[t//time_resolution*time_resolution] += data[t] + counts[t//time_resolution*time_resolution] += 1 + + if use_avg: + out_data = { t: v/counts[t] for t,v in out_data.items() } + + plt.plot(list(out_data.keys()), list(out_data.values()), label=label) + + def add_plot_with_times(data, label): + plt.plot(list(data.keys()), list(data.values()), label=label) + + def display(self, time_resolution=0.1): + plt.figure(figsize=(40,9)) + plt.subplot(3, 3, 1) + for priority in self.results.started.keys(): + Plotter.add_plot(self.results.started[priority], time_resolution, priority) + + plt.xlabel('Time (s)') + plt.ylabel('Released/s') + plt.legend() + + plt.subplot(3, 3, 2) + for priority in self.results.queued.keys(): + Plotter.add_plot(self.results.queued[priority], time_resolution, priority) + + plt.xlabel('Time (s)') + plt.ylabel('Requests/s') + plt.legend() + + plt.subplot(3, 3, 3) + for priority in self.results.unprocessed_queue_sizes.keys(): + data = {k: max(v) for (k,v) in self.results.unprocessed_queue_sizes[priority].items()} + Plotter.add_plot(data, time_resolution, priority) + + plt.xlabel('Time (s)') + plt.ylabel('Max queue size') + plt.legend() + + num = 4 + for priority in self.results.latencies.keys(): + plt.subplot(3, 3, num) + median_latencies = {k: v[int(0.5*len(v))] if len(v) > 0 else 0 for (k,v) in self.results.latencies[priority].items()} + percentile90_latencies = {k: v[int(0.9*len(v))] if len(v) > 0 else 0 for (k,v) in self.results.latencies[priority].items()} + max_latencies = {k: max(v) if len(v) > 0 else 0 for (k,v) in self.results.latencies[priority].items()} + + Plotter.add_plot(median_latencies, 
time_resolution, 'median') + Plotter.add_plot(percentile90_latencies, time_resolution, '90th percentile') + Plotter.add_plot(max_latencies, time_resolution, 'max') + + plt.xlabel('Time (s)') + plt.ylabel(str(priority) + ' Latency (s)') + plt.yscale('log') + plt.legend() + num += 1 + + for priority in self.results.rate.keys(): + plt.subplot(3, 3, num) + if len(self.results.rate[priority]) > 0: + Plotter.add_plot(self.results.rate[priority], time_resolution, 'Rate', use_avg=True) + if len(self.results.released[priority]) > 0: + Plotter.add_plot(self.results.released[priority], time_resolution, 'Released', use_avg=True) + if len(self.results.limit[priority]) > 0: + Plotter.add_plot(self.results.limit[priority], time_resolution, 'Limit', use_avg=True) + if len(self.results.limit_and_budget[priority]) > 0: + Plotter.add_plot(self.results.limit_and_budget[priority], time_resolution, 'Limit and budget', use_avg=True) + if len(self.results.budget[priority]) > 0: + Plotter.add_plot(self.results.budget[priority], time_resolution, 'Budget', use_avg=True) + + plt.xlabel('Time (s)') + plt.ylabel('Value (' + str(priority) + ')') + plt.legend() + num += 1 + + plt.show() + diff --git a/contrib/grv_proxy_model/priority.py b/contrib/grv_proxy_model/priority.py new file mode 100755 index 0000000000..3ba5c05f2e --- /dev/null +++ b/contrib/grv_proxy_model/priority.py @@ -0,0 +1,40 @@ +# +# priority.py +# +# This source file is part of the FoundationDB open source project +# +# Copyright 2013-2020 Apple Inc. and the FoundationDB project authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +import functools + +@functools.total_ordering +class Priority: + def __init__(self, priority_value, label): + self.priority_value = priority_value + self.label = label + + def __lt__(self, other): + return self.priority_value < other.priority_value + + def __str__(self): + return self.label + + def __repr__(self): + return repr(self.label) + +Priority.SYSTEM = Priority(0, "System") +Priority.DEFAULT = Priority(1, "Default") +Priority.BATCH = Priority(2, "Batch") diff --git a/contrib/grv_proxy_model/proxy_model.py b/contrib/grv_proxy_model/proxy_model.py new file mode 100755 index 0000000000..9ca2a39bfe --- /dev/null +++ b/contrib/grv_proxy_model/proxy_model.py @@ -0,0 +1,338 @@ +# +# proxy_model.py +# +# This source file is part of the FoundationDB open source project +# +# Copyright 2013-2020 Apple Inc. and the FoundationDB project authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import copy +import functools +import heapq + +from priority import Priority +from smoother import Smoother + +@functools.total_ordering +class Task: + def __init__(self, time, fxn): + self.time = time + self.fxn = fxn + + def __lt__(self, other): + return self.time < other.time + +class Limiter: + class UpdateRateParams: + def __init__(self, time): + self.time = time + + class UpdateLimitParams: + def __init__(self, time, elapsed): + self.time = time + self.elapsed = elapsed + + class CanStartParams: + def __init__(self, time, num_started, count): + self.time = time + self.num_started = num_started + self.count = count + + class UpdateBudgetParams: + def __init__(self, time, num_started, num_started_at_priority, min_priority, last_batch, queue_empty, elapsed): + self.time = time + self.num_started = num_started + self.num_started_at_priority = num_started_at_priority + self.min_priority = min_priority + self.last_batch = last_batch + self.queue_empty = queue_empty + self.elapsed = elapsed + + def __init__(self, priority, ratekeeper_model, proxy_model): + self.priority = priority + self.ratekeeper_model = ratekeeper_model + self.proxy_model = proxy_model + self.limit = 0 + self.rate = self.ratekeeper_model.get_limit(0, self.priority) + + def update_rate(self, params): + pass + + def update_limit(self, params): + pass + + def can_start(self, params): + pass + + def update_budget(self, params): + pass + +class OriginalLimiter(Limiter): + def __init__(self, priority, limit_rate_model, proxy_model): + Limiter.__init__(self, priority, limit_rate_model, proxy_model) + + def update_rate(self, params): + self.rate = self.ratekeeper_model.get_limit(params.time, self.priority) + + def update_limit(self, params): + self.limit = min(0, self.limit) + params.elapsed * self.rate + self.limit = min(self.limit, self.rate * 0.01) + self.limit = min(self.limit, 100000) + + self.proxy_model.results.rate[self.priority][params.time] = self.rate + 
self.proxy_model.results.limit[self.priority][params.time] = self.limit + + def can_start(self, params): + return params.num_started < self.limit + + def update_budget(self, params): + self.limit -= params.num_started + +class PositiveBudgetLimiter(OriginalLimiter): + def __init__(self, priority, limit_rate_model, proxy_model): + OriginalLimiter.__init__(self, priority, limit_rate_model, proxy_model) + + def update_limit(self, params): + self.limit += params.elapsed * self.rate + self.limit = min(self.limit, 2.0 * self.rate) + +class ClampedBudgetLimiter(PositiveBudgetLimiter): + def __init__(self, priority, limit_rate_model, proxy_model): + PositiveBudgetLimiter.__init__(self, priority, limit_rate_model, proxy_model) + + def update_budget(self, params): + min_budget = -self.rate * 5.0 + if self.limit > min_budget: + self.limit = max(self.limit - params.num_started, min_budget) + +class TimeLimiter(PositiveBudgetLimiter): + def __init__(self, priority, limit_rate_model, proxy_model): + PositiveBudgetLimiter.__init__(self, priority, limit_rate_model, proxy_model) + self.locked_until = 0 + + def can_start(self, params): + return params.time >= self.locked_until and PositiveBudgetLimiter.can_start(self, params) + + def update_budget(self, params): + #print('Start update budget: time=%f, limit=%f, locked_until=%f, num_started=%d, priority=%s, min_priority=%s, last_batch=%d' % (params.time, self.limit, self.locked_until, params.num_started, self.priority, params.min_priority, params.last_batch)) + + if params.min_priority >= self.priority or params.num_started < self.limit: + self.limit -= params.num_started + else: + self.limit = min(self.limit, max(self.limit - params.num_started, -params.last_batch)) + self.locked_until = min(params.time + 2.0, max(params.time, self.locked_until) + (params.num_started - self.limit)/self.rate) + + #print('End update budget: time=%f, limit=%f, locked_until=%f, num_started=%d, priority=%s, min_priority=%s' % (params.time, self.limit, 
self.locked_until, params.num_started, self.priority, params.min_priority))
+
+class TimePositiveBudgetLimiter(PositiveBudgetLimiter):
+    def __init__(self, priority, limit_rate_model, proxy_model):
+        PositiveBudgetLimiter.__init__(self, priority, limit_rate_model, proxy_model)
+        self.locked_until = 0
+
+    def update_limit(self, params):
+        # Budget only accumulates while the limiter is not locked out.
+        if params.time >= self.locked_until:
+            PositiveBudgetLimiter.update_limit(self, params)
+
+    def can_start(self, params):
+        return params.num_started + params.count <= self.limit
+
+    def update_budget(self, params):
+        #if params.num_started > 0:
+            #print('Start update budget: time=%f, limit=%f, locked_until=%f, num_started=%d, priority=%s, min_priority=%s, last_batch=%d' % (params.time, self.limit, self.locked_until, params.num_started, self.priority, params.min_priority, params.last_batch))
+
+        if params.num_started > self.limit:
+            # Fix: 'penalty' was an undefined name here (NameError on the
+            # first budget overrun). Mirroring TimeLimiter.update_budget, the
+            # penalty is the overshoot beyond the budget; lock the limiter
+            # out long enough to pay it back at the current rate, capped at
+            # 2 seconds past the current time.
+            overshoot = params.num_started - self.limit
+            self.locked_until = min(params.time + 2.0, max(params.time, self.locked_until) + overshoot/self.rate)
+            self.limit = 0
+        else:
+            self.limit -= params.num_started
+
+        #if params.num_started > 0:
+            #print('End update budget: time=%f, limit=%f, locked_until=%f, num_started=%d, priority=%s, min_priority=%s' % (params.time, self.limit, self.locked_until, params.num_started, self.priority, params.min_priority))
+
+class SmoothingLimiter(OriginalLimiter):
+    def __init__(self, priority, limit_rate_model, proxy_model):
+        OriginalLimiter.__init__(self, priority, limit_rate_model, proxy_model)
+        self.smooth_released = Smoother(2)
+        self.smooth_rate_limit = Smoother(2)
+        self.rate_set = False
+
+    def update_rate(self, params):
+        OriginalLimiter.update_rate(self, params)
+        if not self.rate_set:
+            self.rate_set = True
+            self.smooth_rate_limit.reset(self.rate)
+        else:
+            self.smooth_rate_limit.set_total(params.time, self.rate)
+
+    def update_limit(self, params):
+        self.limit = 2.0 * (self.smooth_rate_limit.smooth_total(params.time) - self.smooth_released.smooth_rate(params.time))
+
+    def can_start(self, params):
+        return 
params.num_started + params.count <= self.limit + + def update_budget(self, params): + self.smooth_released.add_delta(params.time, params.num_started) + +class SmoothingBudgetLimiter(SmoothingLimiter): + def __init__(self, priority, limit_rate_model, proxy_model): + SmoothingLimiter.__init__(self, priority, limit_rate_model, proxy_model) + #self.smooth_filled = Smoother(2) + self.budget = 0 + + def update_limit(self, params): + release_rate = (self.smooth_rate_limit.smooth_total(params.time) - self.smooth_released.smooth_rate(params.time)) + #self.smooth_filled.set_total(params.time, 1 if release_rate > 0 else 0) + self.limit = 2.0 * release_rate + + self.proxy_model.results.rate[self.priority][params.time] = self.smooth_rate_limit.smooth_total(params.time) + self.proxy_model.results.released[self.priority][params.time] = self.smooth_released.smooth_rate(params.time) + self.proxy_model.results.limit[self.priority][params.time] = self.limit + self.proxy_model.results.limit_and_budget[self.priority][params.time] = self.limit + self.budget + self.proxy_model.results.budget[self.priority][params.time] = self.budget + + #self.budget = max(0, self.budget + params.elapsed * self.smooth_rate_limit.smooth_total(params.time)) + + #if self.smooth_filled.smooth_total(params.time) >= 0.1: + #self.budget += params.elapsed * self.smooth_rate_limit.smooth_total(params.time) + + #print('Update limit: time=%f, priority=%s, limit=%f, rate=%f, released=%f, budget=%f' % (params.time, self.priority, self.limit, self.smooth_rate_limit.smooth_total(params.time), self.smooth_released.smooth_rate(params.time), self.budget)) + + def can_start(self, params): + return params.num_started + params.count <= self.limit + self.budget #or params.num_started + params.count <= self.budget + + def update_budget(self, params): + self.budget = max(0, self.budget + (self.limit - params.num_started_at_priority) / 2 * params.elapsed) + + if params.queue_empty: + self.budget = min(10, self.budget) + + 
self.smooth_released.add_delta(params.time, params.num_started_at_priority) + +class ProxyModel: + class Results: + def __init__(self, priorities, duration): + self.started = self.init_result(priorities, 0, duration) + self.queued = self.init_result(priorities, 0, duration) + self.latencies = self.init_result(priorities, [], duration) + self.unprocessed_queue_sizes = self.init_result(priorities, [], duration) + + self.rate = {p:{} for p in priorities} + self.released = {p:{} for p in priorities} + self.limit = {p:{} for p in priorities} + self.limit_and_budget = {p:{} for p in priorities} + self.budget = {p:{} for p in priorities} + + def init_result(self, priorities, starting_value, duration): + return {p: {s: copy.copy(starting_value) for s in range(0, duration)} for p in priorities} + + def __init__(self, duration, ratekeeper_model, workload_model, Limiter): + self.time = 0 + self.log_time = 0 + self.duration = duration + self.priority_limiters = { priority: Limiter(priority, ratekeeper_model, self) for priority in workload_model.priorities() } + self.workload_model = workload_model + self.request_scheduled = { p: False for p in self.workload_model.priorities()} + + self.tasks = [] + self.request_queue = [] + self.results = ProxyModel.Results(self.workload_model.priorities(), duration) + + def run(self): + self.update_rate() + self.process_requests(self.time) + + for priority in self.workload_model.priorities(): + next_request = self.workload_model.next_request(self.time, priority) + assert next_request is not None + heapq.heappush(self.tasks, Task(next_request.time, lambda next_request=next_request: self.receive_request(next_request))) + self.request_scheduled[priority] = True + + while True:# or len(self.request_queue) > 0: + if int(self.time) > self.log_time: + self.log_time = int(self.time) + #print(self.log_time) + + task = heapq.heappop(self.tasks) + self.time = task.time + if self.time >= self.duration: + break + + task.fxn() + + def update_rate(self): + 
for limiter in self.priority_limiters.values(): + limiter.update_rate(Limiter.UpdateRateParams(self.time)) + + heapq.heappush(self.tasks, Task(self.time + 0.01, lambda: self.update_rate())) + + def receive_request(self, request): + heapq.heappush(self.request_queue, request) + + self.results.queued[request.priority][int(self.time)] += request.count + + next_request = self.workload_model.next_request(self.time, request.priority) + if next_request is not None and next_request.time < self.duration: + heapq.heappush(self.tasks, Task(next_request.time, lambda: self.receive_request(next_request))) + else: + self.request_scheduled[request.priority] = False + + def process_requests(self, last_time): + elapsed = self.time - last_time + for limiter in self.priority_limiters.values(): + limiter.update_limit(Limiter.UpdateLimitParams(self.time, elapsed)) + + current_started = 0 + started = {p:0 for p in self.workload_model.priorities()} + + min_priority = Priority.SYSTEM + last_batch = 0 + while len(self.request_queue) > 0: + request = self.request_queue[0] + + if not self.priority_limiters[request.priority].can_start(Limiter.CanStartParams(self.time, current_started, request.count)): + break + + min_priority = request.priority + last_batch = request.count + + if self.workload_model.request_completed(request) and not self.request_scheduled[request.priority]: + next_request = self.workload_model.next_request(self.time, request.priority) + assert next_request is not None + heapq.heappush(self.tasks, Task(next_request.time, lambda next_request=next_request: self.receive_request(next_request))) + self.request_scheduled[request.priority] = True + + current_started += request.count + started[request.priority] += request.count + + heapq.heappop(self.request_queue) + self.results.started[request.priority][int(self.time)] += request.count + self.results.latencies[request.priority][int(self.time)].append(self.time-request.time) + + if len(self.request_queue) == 0: + min_priority = 
Priority.BATCH + + for priority, limiter in self.priority_limiters.items(): + started_at_priority = sum([v for p,v in started.items() if p <= priority]) + limiter.update_budget(Limiter.UpdateBudgetParams(self.time, current_started, started_at_priority, min_priority, last_batch, len(self.request_queue) == 0 or self.request_queue[0].priority > priority, elapsed)) + + for priority in self.workload_model.priorities(): + self.results.unprocessed_queue_sizes[priority][int(self.time)].append(self.workload_model.workload_models[priority].outstanding) + + current_time = self.time + + delay = 0.001 + heapq.heappush(self.tasks, Task(self.time + delay, lambda: self.process_requests(current_time))) + + diff --git a/contrib/grv_proxy_model/rate_model.py b/contrib/grv_proxy_model/rate_model.py new file mode 100755 index 0000000000..1fabce2c7e --- /dev/null +++ b/contrib/grv_proxy_model/rate_model.py @@ -0,0 +1,83 @@ +# +# rate_model.py +# +# This source file is part of the FoundationDB open source project +# +# Copyright 2013-2020 Apple Inc. and the FoundationDB project authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import numpy + +class RateModel: + def __init__(self): + pass + + def get_rate(self, time): + pass + +class FixedRateModel(RateModel): + def __init__(self, rate): + RateModel.__init__(self) + self.rate = rate + + def get_rate(self, time): + return self.rate + +class UnlimitedRateModel(FixedRateModel): + def __init__(self): + self.rate = 1e9 + +class IntervalRateModel(RateModel): + def __init__(self, intervals): + self.intervals = sorted(intervals) + + def get_rate(self, time): + if len(self.intervals) == 0 or time < self.intervals[0][0]: + return 0 + + target_interval = len(self.intervals)-1 + for i in range(1, len(self.intervals)): + if time < self.intervals[i][0]: + target_interval = i-1 + break + + self.intervals = self.intervals[target_interval:] + return self.intervals[0][1] + +class SawtoothRateModel(RateModel): + def __init__(self, low, high, frequency): + self.low = low + self.high = high + self.frequency = frequency + + def get_rate(self, time): + if int(2*time/self.frequency) % 2 == 0: + return self.low + else: + return self.high + +class DistributionRateModel(RateModel): + def __init__(self, distribution, frequency): + self.distribution = distribution + self.frequency = frequency + self.last_change = 0 + self.rate = None + + def get_rate(self, time): + if self.frequency == 0 or int((time - self.last_change) / self.frequency) > int(self.last_change / self.frequency) or self.rate is None: + self.last_change = time + self.rate = self.distribution() + + return self.rate diff --git a/contrib/grv_proxy_model/ratekeeper_model.py b/contrib/grv_proxy_model/ratekeeper_model.py new file mode 100755 index 0000000000..57125dc4c0 --- /dev/null +++ b/contrib/grv_proxy_model/ratekeeper_model.py @@ -0,0 +1,67 @@ +# +# ratekeeper.py +# +# This source file is part of the FoundationDB open source project +# +# Copyright 2013-2020 Apple Inc. 
and the FoundationDB project authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import numpy +import rate_model +from priority import Priority + +class RatekeeperModel: + def __init__(self, limit_models): + self.limit_models = limit_models + + def get_limit(self, time, priority): + return self.limit_models[priority].get_rate(time) + +predefined_ratekeeper = {} + +predefined_ratekeeper['default200_batch100'] = RatekeeperModel( +{ + Priority.SYSTEM: rate_model.UnlimitedRateModel(), + Priority.DEFAULT: rate_model.FixedRateModel(200), + Priority.BATCH: rate_model.FixedRateModel(100) +}) + +predefined_ratekeeper['default_sawtooth'] = RatekeeperModel( +{ + Priority.SYSTEM: rate_model.UnlimitedRateModel(), + Priority.DEFAULT: rate_model.SawtoothRateModel(10, 200, 1), + Priority.BATCH: rate_model.FixedRateModel(0) +}) + +predefined_ratekeeper['default_uniform_random'] = RatekeeperModel( +{ + Priority.SYSTEM: rate_model.UnlimitedRateModel(), + Priority.DEFAULT: rate_model.DistributionRateModel(lambda: numpy.random.uniform(10, 200), 1), + Priority.BATCH: rate_model.FixedRateModel(0) +}) + +predefined_ratekeeper['default_trickle'] = RatekeeperModel( +{ + Priority.SYSTEM: rate_model.UnlimitedRateModel(), + Priority.DEFAULT: rate_model.FixedRateModel(3), + Priority.BATCH: rate_model.FixedRateModel(0) +}) + +predefined_ratekeeper['default1000'] = RatekeeperModel( +{ + Priority.SYSTEM: rate_model.UnlimitedRateModel(), + Priority.DEFAULT: rate_model.FixedRateModel(1000), + 
Priority.BATCH: rate_model.FixedRateModel(500) +}) diff --git a/contrib/grv_proxy_model/smoother.py b/contrib/grv_proxy_model/smoother.py new file mode 100644 index 0000000000..bc1b32ea12 --- /dev/null +++ b/contrib/grv_proxy_model/smoother.py @@ -0,0 +1,53 @@ +# +# smoother.py +# +# This source file is part of the FoundationDB open source project +# +# Copyright 2013-2020 Apple Inc. and the FoundationDB project authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import math + +class Smoother: + def __init__(self, folding_time): + self.folding_time = folding_time + self.reset(0) + + def reset(self, value): + self.time = 0 + self.total = value + self.estimate = value + + def set_total(self, time, total): + self.add_delta(time, total-self.total) + + def add_delta(self, time, delta): + self.update(time) + self.total += delta + + def smooth_total(self, time): + self.update(time) + return self.estimate + + def smooth_rate(self, time): + self.update(time) + return (self.total-self.estimate) / self.folding_time + + def update(self, time): + elapsed = time - self.time + if elapsed > 0: + self.time = time + self.estimate += (self.total-self.estimate) * (1-math.exp(-elapsed/self.folding_time)) + diff --git a/contrib/grv_proxy_model/workload_model.py b/contrib/grv_proxy_model/workload_model.py new file mode 100755 index 0000000000..63fb4c472e --- /dev/null +++ b/contrib/grv_proxy_model/workload_model.py @@ -0,0 +1,201 @@ +# +# workload_model.py +# +# This source file is 
part of the FoundationDB open source project +# +# Copyright 2013-2020 Apple Inc. and the FoundationDB project authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import functools +import numpy +import math + +import rate_model +from priority import Priority + +@functools.total_ordering +class Request: + def __init__(self, time, count, priority): + self.time = time + self.count = count + self.priority = priority + + def __lt__(self, other): + return self.priority < other.priority + +class PriorityWorkloadModel: + def __init__(self, priority, rate_model, batch_model, generator, max_outstanding=1e9): + self.priority = priority + self.rate_model = rate_model + self.batch_model = batch_model + self.generator = generator + self.max_outstanding = max_outstanding + self.outstanding = 0 + + def next_request(self, time): + if self.outstanding >= self.max_outstanding: + return None + + batch_size = self.batch_model.next_batch() + self.outstanding += batch_size + interval = self.generator.next_request_interval(self.rate_model.get_rate(time)) + return Request(time + interval, batch_size, self.priority) + + def request_completed(self, request): + was_full = self.max_outstanding <= self.outstanding + self.outstanding -= request.count + + return was_full and self.outstanding < self.max_outstanding + +class WorkloadModel: + def __init__(self, workload_models): + self.workload_models = workload_models + + def priorities(self): + return list(self.workload_models.keys()) + + 
def next_request(self, time, priority): + return self.workload_models[priority].next_request(time) + + def request_completed(self, request): + return self.workload_models[request.priority].request_completed(request) + +class Distribution: + EXPONENTIAL = lambda x: numpy.random.exponential(x) + UNIFORM = lambda x: numpy.random.uniform(0, 2.0*x) + FIXED = lambda x: x + +class BatchGenerator: + def __init__(self): + pass + + def next_batch(self): + pass + +class DistributionBatchGenerator(BatchGenerator): + def __init__(self, distribution, size): + BatchGenerator.__init__(self) + self.distribution = distribution + self.size = size + + def next_batch(self): + return math.ceil(self.distribution(self.size)) + +class RequestGenerator: + def __init__(self): + pass + + def next_request_interval(self, rate): + pass + +class DistributionRequestGenerator(RequestGenerator): + def __init__(self, distribution): + RequestGenerator.__init__(self) + self.distribution = distribution + + def next_request_interval(self, rate): + if rate == 0: + return 1e9 + + return self.distribution(1.0/rate) + +predefined_workloads = {} + +predefined_workloads['slow_exponential'] = WorkloadModel( +{ + Priority.DEFAULT: PriorityWorkloadModel(Priority.DEFAULT, + rate_model.FixedRateModel(100), + DistributionBatchGenerator(Distribution.FIXED, 1), + DistributionRequestGenerator(Distribution.EXPONENTIAL), + max_outstanding=100 + ) +}) + +predefined_workloads['fixed_uniform'] = WorkloadModel( +{ + Priority.SYSTEM: PriorityWorkloadModel(Priority.SYSTEM, + rate_model.FixedRateModel(0), + DistributionBatchGenerator(Distribution.FIXED, 1), + DistributionRequestGenerator(Distribution.UNIFORM), + max_outstanding=10 + ), + Priority.DEFAULT: PriorityWorkloadModel(Priority.DEFAULT, + rate_model.FixedRateModel(95), + DistributionBatchGenerator(Distribution.FIXED, 10), + DistributionRequestGenerator(Distribution.UNIFORM), + max_outstanding=200 + ), + Priority.BATCH: PriorityWorkloadModel(Priority.BATCH, + 
rate_model.FixedRateModel(1), + DistributionBatchGenerator(Distribution.UNIFORM, 500), + DistributionRequestGenerator(Distribution.UNIFORM), + max_outstanding=200 + ) +}) + +predefined_workloads['batch_starvation'] = WorkloadModel( +{ + Priority.SYSTEM: PriorityWorkloadModel(Priority.SYSTEM, + rate_model.FixedRateModel(1), + DistributionBatchGenerator(Distribution.FIXED, 1), + DistributionRequestGenerator(Distribution.UNIFORM), + max_outstanding=10 + ), + Priority.DEFAULT: PriorityWorkloadModel(Priority.DEFAULT, + rate_model.IntervalRateModel([(0,50), (60,150), (120,90)]), + DistributionBatchGenerator(Distribution.FIXED, 1), + DistributionRequestGenerator(Distribution.UNIFORM), + max_outstanding=200 + ), + Priority.BATCH: PriorityWorkloadModel(Priority.BATCH, + rate_model.FixedRateModel(100), + DistributionBatchGenerator(Distribution.FIXED, 1), + DistributionRequestGenerator(Distribution.UNIFORM), + max_outstanding=200 + ) +}) + +predefined_workloads['default_low_high_low'] = WorkloadModel( +{ + Priority.SYSTEM: PriorityWorkloadModel(Priority.SYSTEM, + rate_model.FixedRateModel(0), + DistributionBatchGenerator(Distribution.FIXED, 1), + DistributionRequestGenerator(Distribution.UNIFORM), + max_outstanding=10 + ), + Priority.DEFAULT: PriorityWorkloadModel(Priority.DEFAULT, + rate_model.IntervalRateModel([(0,100), (60,300), (120,100)]), + DistributionBatchGenerator(Distribution.FIXED, 1), + DistributionRequestGenerator(Distribution.UNIFORM), + max_outstanding=200 + ), + Priority.BATCH: PriorityWorkloadModel(Priority.BATCH, + rate_model.FixedRateModel(0), + DistributionBatchGenerator(Distribution.FIXED, 1), + DistributionRequestGenerator(Distribution.UNIFORM), + max_outstanding=200 + ) +}) + +for rate in [83, 100, 180, 190, 200]: + predefined_workloads['default%d' % rate] = WorkloadModel( + { + Priority.DEFAULT: PriorityWorkloadModel(Priority.DEFAULT, + rate_model.FixedRateModel(rate), + DistributionBatchGenerator(Distribution.FIXED, 1), + 
DistributionRequestGenerator(Distribution.EXPONENTIAL), + max_outstanding=1000 + ) + }) From 82f7f541c39377ae2386cc52b777b354b3f545c4 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Wed, 25 Nov 2020 11:38:08 -0700 Subject: [PATCH 002/317] started lineage implementation --- flow/flow.cpp | 2 ++ flow/flow.h | 33 +++++++++++++++++++++++++++++++-- 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/flow/flow.cpp b/flow/flow.cpp index 89f04bd5df..a2bfcc1510 100644 --- a/flow/flow.cpp +++ b/flow/flow.cpp @@ -26,6 +26,8 @@ #include #include +thread_local ActorLineagePropertyMap* currentLineage = nullptr; + #if (defined(__linux__) || defined(__FreeBSD__)) && defined(__AVX__) && !defined(MEMORY_SANITIZER) // For benchmarking; need a version of rte_memcpy that doesn't live in the same compilation unit as the test. void * rte_memcpy_noinline(void *__restrict __dest, const void *__restrict __src, size_t __n) { diff --git a/flow/flow.h b/flow/flow.h index a72465143d..155c5db2a2 100644 --- a/flow/flow.h +++ b/flow/flow.h @@ -36,6 +36,7 @@ #include #include #include +#include #include "flow/Platform.h" #include "flow/FastAlloc.h" @@ -407,6 +408,30 @@ struct SingleCallback { } }; +// in the future we might want to read these from a different thread. std::shared_ptr +// seems to be better suited for this... +struct ActorLineagePropertyMap : std::enable_shared_from_this { + std::shared_ptr parent = nullptr; +}; + +extern thread_local ActorLineagePropertyMap* currentLineage; + +struct ActorLineage { + std::shared_ptr properties = std::make_shared(); + ActorLineage() { + if (currentLineage) { + properties->parent = currentLineage->shared_from_this(); + } + } +}; + +struct save_lineage { + ActorLineagePropertyMap* current = currentLineage; + ~save_lineage() { + currentLineage = current; + } +}; + // SAV is short for Single Assignment Variable: It can be assigned for only once! 
template struct SAV : private Callback, FastAllocated> { @@ -445,6 +470,7 @@ public: ASSERT(canBeSet()); new (&value_storage) T(std::forward(value)); this->error_state = Error::fromCode(SET_ERROR_CODE); + save_lineage _{}; while (Callback::next != this) Callback::next->fire(this->value()); } @@ -457,6 +483,7 @@ public: void sendError(Error err) { ASSERT(canBeSet() && int16_t(err.code()) > 0); this->error_state = err; + save_lineage _{}; while (Callback::next != this) Callback::next->error(err); } @@ -477,6 +504,7 @@ public: void finishSendAndDelPromiseRef() { // Call only after value_storage has already been initialized! this->error_state = Error::fromCode(SET_ERROR_CODE); + save_lineage _{}; while (Callback::next != this) Callback::next->fire(this->value()); @@ -500,6 +528,7 @@ public: } this->error_state = err; + save_lineage _{}; while (Callback::next != this) Callback::next->error(err); @@ -987,7 +1016,7 @@ static inline void destruct(T& t) { } template -struct Actor : SAV { +struct Actor : SAV, ActorLineage { int8_t actor_wait_state; // -1 means actor is cancelled; 0 means actor is not waiting; 1-N mean waiting in callback group # Actor() : SAV(1, 1), actor_wait_state(0) { /*++actorCount;*/ } @@ -995,7 +1024,7 @@ struct Actor : SAV { }; template <> -struct Actor { +struct Actor : ActorLineage { // This specialization is for a void actor (one not returning a future, hence also uncancellable) int8_t actor_wait_state; // 0 means actor is not waiting; 1-N mean waiting in callback group # From 05f77f905fb3a32c026729479de3de5456a5789e Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Mon, 7 Dec 2020 15:15:25 -0700 Subject: [PATCH 003/317] Added actor lineage --- flow/actorcompiler/ActorCompiler.cs | 1 + flow/actorcompiler/actorcompiler.csproj | 108 +----------------------- flow/actorcompiler/actorcompiler.sln | 34 ++++++++ flow/flow.cpp | 5 +- flow/flow.h | 96 +++++++++++++-------- flow/genericactors.actor.h | 4 + 6 files changed, 110 insertions(+), 138 
deletions(-) create mode 100644 flow/actorcompiler/actorcompiler.sln diff --git a/flow/actorcompiler/ActorCompiler.cs b/flow/actorcompiler/ActorCompiler.cs index 7aef82a42e..dc9de91868 100644 --- a/flow/actorcompiler/ActorCompiler.cs +++ b/flow/actorcompiler/ActorCompiler.cs @@ -452,6 +452,7 @@ namespace actorcompiler fullClassName, string.Join(", ", actor.parameters.Select(p => p.name).ToArray())); + writer.WriteLine("restore_lineage _;"); if (actor.returnType != null) writer.WriteLine("\treturn Future<{1}>({0});", newActor, actor.returnType); else diff --git a/flow/actorcompiler/actorcompiler.csproj b/flow/actorcompiler/actorcompiler.csproj index e737adabd2..b590913634 100644 --- a/flow/actorcompiler/actorcompiler.csproj +++ b/flow/actorcompiler/actorcompiler.csproj @@ -1,108 +1,8 @@ - - + + - Debug - 10.0.20506 - 2.0 - {0ECC1314-3FC2-458D-8E41-B50B4EA24E51} Exe - Properties - actorcompiler - actorcompiler - v4.0 - 512 - $(SolutionDir)bin\$(Configuration)\ - publish\ - true - Disk - false - Foreground - 7 - Days - false - false - true - 0 - 1.0.0.%2a - false - false - true + net5.0 - - true - DEBUG;TRACE - full - AnyCPU - default - prompt - false - false - - - TRACE - true - pdbonly - AnyCPU - default - prompt - false - false - - - - - 3.5 - - - 3.5 - - - 3.5 - - - 4.0 - - - - - - - - - - - - - - False - Microsoft .NET Framework 4 %28x86 and x64%29 - true - - - False - .NET Framework 3.5 SP1 Client Profile - false - - - False - .NET Framework 3.5 SP1 - false - - - False - Windows Installer 3.1 - true - - - - - - - + \ No newline at end of file diff --git a/flow/actorcompiler/actorcompiler.sln b/flow/actorcompiler/actorcompiler.sln new file mode 100644 index 0000000000..a4292bfaaa --- /dev/null +++ b/flow/actorcompiler/actorcompiler.sln @@ -0,0 +1,34 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 15 +VisualStudioVersion = 15.0.26124.0 +MinimumVisualStudioVersion = 15.0.26124.0 +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = 
"actorcompiler", "actorcompiler.csproj", "{0ECC1314-3FC2-458D-8E41-B50B4EA24E51}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Any CPU = Debug|Any CPU + Debug|x64 = Debug|x64 + Debug|x86 = Debug|x86 + Release|Any CPU = Release|Any CPU + Release|x64 = Release|x64 + Release|x86 = Release|x86 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Debug|Any CPU.Build.0 = Debug|Any CPU + {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Debug|x64.ActiveCfg = Debug|Any CPU + {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Debug|x64.Build.0 = Debug|Any CPU + {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Debug|x86.ActiveCfg = Debug|Any CPU + {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Debug|x86.Build.0 = Debug|Any CPU + {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Release|Any CPU.ActiveCfg = Debug|Any CPU + {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Release|Any CPU.Build.0 = Debug|Any CPU + {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Release|x64.ActiveCfg = Debug|Any CPU + {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Release|x64.Build.0 = Debug|Any CPU + {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Release|x86.ActiveCfg = Debug|Any CPU + {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Release|x86.Build.0 = Debug|Any CPU + EndGlobalSection +EndGlobal diff --git a/flow/flow.cpp b/flow/flow.cpp index a2bfcc1510..c4a6097300 100644 --- a/flow/flow.cpp +++ b/flow/flow.cpp @@ -26,7 +26,10 @@ #include #include -thread_local ActorLineagePropertyMap* currentLineage = nullptr; +extern thread_local Reference currentLineage; + +ActorLineage::ActorLineage() : parent(currentLineage) { +} #if (defined(__linux__) || defined(__FreeBSD__)) && defined(__AVX__) && !defined(MEMORY_SANITIZER) // For benchmarking; need a version of rte_memcpy that doesn't live in the same 
compilation unit as the test. diff --git a/flow/flow.h b/flow/flow.h index 155c5db2a2..a0c9793a7a 100644 --- a/flow/flow.h +++ b/flow/flow.h @@ -20,6 +20,7 @@ #ifndef FLOW_FLOW_H #define FLOW_FLOW_H +#include "flow/FastRef.h" #pragma once #pragma warning( disable: 4244 4267 ) // SOMEDAY: Carefully check for integer overflow issues (e.g. size_t to int conversions like this suppresses) @@ -408,28 +409,21 @@ struct SingleCallback { } }; -// in the future we might want to read these from a different thread. std::shared_ptr -// seems to be better suited for this... -struct ActorLineagePropertyMap : std::enable_shared_from_this { - std::shared_ptr parent = nullptr; +struct ActorLineagePropertyMap : ReferenceCounted { }; -extern thread_local ActorLineagePropertyMap* currentLineage; - -struct ActorLineage { - std::shared_ptr properties = std::make_shared(); - ActorLineage() { - if (currentLineage) { - properties->parent = currentLineage->shared_from_this(); - } - } +struct ActorLineage : ReferenceCounted { + Reference map; + Reference parent; + ActorLineage(); }; -struct save_lineage { - ActorLineagePropertyMap* current = currentLineage; - ~save_lineage() { - currentLineage = current; - } +extern thread_local Reference currentLineage; + +struct restore_lineage { + Reference lineage; + restore_lineage() : lineage(currentLineage) {} + ~restore_lineage() { currentLineage = lineage; } }; // SAV is short for Single Assignment Variable: It can be assigned for only once! 
@@ -447,7 +441,8 @@ public: T& value() { return *(T*)&value_storage; } - SAV(int futures, int promises) : futures(futures), promises(promises), error_state(Error::fromCode(UNSET_ERROR_CODE)) { + SAV(int futures, int promises) + : futures(futures), promises(promises), error_state(Error::fromCode(UNSET_ERROR_CODE)) { Callback::prev = Callback::next = this; } ~SAV() { @@ -466,13 +461,14 @@ public: } template - void send(U && value) { + void send(U&& value) { ASSERT(canBeSet()); new (&value_storage) T(std::forward(value)); this->error_state = Error::fromCode(SET_ERROR_CODE); - save_lineage _{}; - while (Callback::next != this) + restore_lineage _; + while (Callback::next != this) { Callback::next->fire(this->value()); + } } void send(Never) { @@ -483,13 +479,15 @@ public: void sendError(Error err) { ASSERT(canBeSet() && int16_t(err.code()) > 0); this->error_state = err; - save_lineage _{}; - while (Callback::next != this) + restore_lineage _; + while (Callback::next != this) { Callback::next->error(err); + } } template void sendAndDelPromiseRef(U && value) { + restore_lineage _; ASSERT(canBeSet()); if (promises == 1 && !futures) { // No one is left to receive the value, so we can just die @@ -503,8 +501,8 @@ public: void finishSendAndDelPromiseRef() { // Call only after value_storage has already been initialized! 
+ restore_lineage _; this->error_state = Error::fromCode(SET_ERROR_CODE); - save_lineage _{}; while (Callback::next != this) Callback::next->fire(this->value()); @@ -520,6 +518,7 @@ public: } void sendErrorAndDelPromiseRef(Error err) { + restore_lineage _; ASSERT(canBeSet() && int16_t(err.code()) > 0); if (promises == 1 && !futures) { // No one is left to receive the value, so we can just die @@ -528,7 +527,6 @@ public: } this->error_state = err; - save_lineage _{}; while (Callback::next != this) Callback::next->error(err); @@ -624,6 +622,7 @@ struct NotifiedQueue : private SingleCallback, FastAllocated if (error.isValid()) return; if (SingleCallback::next != this) { + restore_lineage _; SingleCallback::next->fire(std::forward(value)); } else { @@ -635,8 +634,10 @@ struct NotifiedQueue : private SingleCallback, FastAllocated if (error.isValid()) return; this->error = err; - if (SingleCallback::next != this) + if (SingleCallback::next != this) { + restore_lineage _; SingleCallback::next->error(err); + } } void addPromiseRef() { promises++; } @@ -1016,38 +1017,67 @@ static inline void destruct(T& t) { } template -struct Actor : SAV, ActorLineage { +struct Actor : SAV { + Reference lineage = Reference{new ActorLineage() }; int8_t actor_wait_state; // -1 means actor is cancelled; 0 means actor is not waiting; 1-N mean waiting in callback group # - Actor() : SAV(1, 1), actor_wait_state(0) { /*++actorCount;*/ } + Actor() : SAV(1, 1), actor_wait_state(0) { + /*++actorCount;*/ + currentLineage = lineage; + } + + Reference setLineage() { + auto res = currentLineage; + currentLineage = lineage; + return res; + } //~Actor() { --actorCount; } }; template <> -struct Actor : ActorLineage { +struct Actor { // This specialization is for a void actor (one not returning a future, hence also uncancellable) + Reference lineage = Reference{new ActorLineage() }; int8_t actor_wait_state; // 0 means actor is not waiting; 1-N mean waiting in callback group # - Actor() : actor_wait_state(0) 
{ /*++actorCount;*/ } + Actor() : actor_wait_state(0) { + /*++actorCount;*/ + currentLineage = lineage; + } + + Reference setLineage() { + auto res = currentLineage; + currentLineage = lineage; + return res; + } //~Actor() { --actorCount; } }; template struct ActorCallback : Callback { - virtual void fire(ValueType const& value) override { static_cast(this)->a_callback_fire(this, value); } - virtual void error(Error e) override { static_cast(this)->a_callback_error(this, e); } + virtual void fire(ValueType const& value) override { + auto _ = static_cast(this)->setLineage(); + static_cast(this)->a_callback_fire(this, value); + } + virtual void error(Error e) override { + auto _ = static_cast(this)->setLineage(); + static_cast(this)->a_callback_error(this, e); + } }; template struct ActorSingleCallback : SingleCallback { virtual void fire(ValueType const& value) override { + auto _ = static_cast(this)->setLineage(); static_cast(this)->a_callback_fire(this, value); } virtual void fire(ValueType && value) override { + auto _ = static_cast(this)->setLineage(); static_cast(this)->a_callback_fire(this, std::move(value)); } virtual void error(Error e) override { + auto _ = static_cast(this)->setLineage(); static_cast(this)->a_callback_error(this, e); } }; diff --git a/flow/genericactors.actor.h b/flow/genericactors.actor.h index 3fcab1f7dd..ab9d9c07d5 100644 --- a/flow/genericactors.actor.h +++ b/flow/genericactors.actor.h @@ -1493,6 +1493,10 @@ struct YieldedFutureActor : SAV, ActorCallback setLineage() { + return currentLineage; + } + void a_callback_fire(ActorCallback*, Void) { if (int16_t(in_error_state.code()) == UNSET_ERROR_CODE) { in_error_state = Error::fromCode(SET_ERROR_CODE); From d837e923ad9f8cbf3a5bcd5668a74d4ee0222c32 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Mon, 7 Dec 2020 15:23:18 -0700 Subject: [PATCH 004/317] minor bugfix --- flow/flow.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flow/flow.cpp b/flow/flow.cpp index 
c4a6097300..ed977141bd 100644 --- a/flow/flow.cpp +++ b/flow/flow.cpp @@ -26,7 +26,7 @@ #include #include -extern thread_local Reference currentLineage; +thread_local Reference currentLineage; ActorLineage::ActorLineage() : parent(currentLineage) { } From 2c4e38329e536172d2413da61d884ef944277598 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Wed, 9 Dec 2020 10:19:32 -0700 Subject: [PATCH 005/317] fix some compiler warnings --- fdbclient/SystemData.cpp | 6 +++--- fdbserver/BackupProgress.actor.cpp | 2 +- fdbserver/BackupWorker.actor.cpp | 6 +++--- fdbserver/CommitProxyServer.actor.cpp | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/fdbclient/SystemData.cpp b/fdbclient/SystemData.cpp index b402ad99a7..16733b1ad6 100644 --- a/fdbclient/SystemData.cpp +++ b/fdbclient/SystemData.cpp @@ -57,7 +57,7 @@ const Value keyServersValue( Standalone result, const std::vecto std::vector destTag; bool foundOldLocality = false; - for (const KeyValueRef kv : result) { + for (const KeyValueRef& kv : result) { UID uid = decodeServerTagKey(kv.key); if (std::find(src.begin(), src.end(), uid) != src.end()) { srcTag.push_back( decodeServerTagValue(kv.value) ); @@ -109,7 +109,7 @@ void decodeKeyServersValue( Standalone result, const ValueRef& v src.clear(); dest.clear(); - for (const KeyValueRef kv : result) { + for (const KeyValueRef& kv : result) { Tag tag = decodeServerTagValue(kv.value); if (std::find(srcTag.begin(), srcTag.end(), tag) != srcTag.end()) { src.push_back( decodeServerTagKey(kv.key) ); @@ -122,7 +122,7 @@ void decodeKeyServersValue( Standalone result, const ValueRef& v std::sort(dest.begin(), dest.end()); if(missingIsError && (src.size() != srcTag.size() || dest.size() != destTag.size())) { TraceEvent(SevError, "AttemptedToDecodeMissingTag"); - for (const KeyValueRef kv : result) { + for (const KeyValueRef& kv : result) { Tag tag = decodeServerTagValue(kv.value); UID serverID = decodeServerTagKey(kv.key); TraceEvent("TagUIDMap").detail("Tag", 
tag.toString()).detail("UID", serverID.toString()); diff --git a/fdbserver/BackupProgress.actor.cpp b/fdbserver/BackupProgress.actor.cpp index 3f1d564c16..f496ec0558 100644 --- a/fdbserver/BackupProgress.actor.cpp +++ b/fdbserver/BackupProgress.actor.cpp @@ -121,7 +121,7 @@ std::map, std::map> BackupProgr } } - for (const Tag tag : tags) { // tags without progress data + for (const Tag& tag : tags) { // tags without progress data tagVersions.insert({ tag, adjustedBeginVersion }); TraceEvent("BackupVersionRange", dbgid) .detail("OldEpoch", epoch) diff --git a/fdbserver/BackupWorker.actor.cpp b/fdbserver/BackupWorker.actor.cpp index 3cea9f6611..b5f78593e2 100644 --- a/fdbserver/BackupWorker.actor.cpp +++ b/fdbserver/BackupWorker.actor.cpp @@ -508,7 +508,7 @@ ACTOR Future setBackupKeys(BackupData* self, std::map savedL state std::vector>> prevVersions; state std::vector versionConfigs; state std::vector>> allWorkersReady; - for (const auto [uid, version] : savedLogVersions) { + for (const auto& [uid, version] : savedLogVersions) { versionConfigs.emplace_back(uid); prevVersions.push_back(versionConfigs.back().latestBackupWorkerSavedVersion().get(tr)); allWorkersReady.push_back(versionConfigs.back().allWorkerStarted().get(tr)); @@ -573,7 +573,7 @@ ACTOR Future monitorBackupProgress(BackupData* self) { if (self->recruitedEpoch == self->oldestBackupEpoch) { // update update progress so far if previous epochs are done Version v = std::numeric_limits::max(); - for (const auto [tag, version] : tagVersions) { + for (const auto& [tag, version] : tagVersions) { v = std::min(v, version); } savedLogVersions.emplace(uid, v); @@ -783,7 +783,7 @@ ACTOR Future saveMutationsToFile(BackupData* self, Version popVersion, int .detail("TagId", self->tag.id) .detail("File", file->getFileName()); } - for (const UID uid : activeUids) { + for (const UID& uid : activeUids) { self->backups[uid].lastSavedVersion = popVersion + 1; } diff --git a/fdbserver/CommitProxyServer.actor.cpp 
b/fdbserver/CommitProxyServer.actor.cpp index eac0f0d4c2..96ae4c000c 100644 --- a/fdbserver/CommitProxyServer.actor.cpp +++ b/fdbserver/CommitProxyServer.actor.cpp @@ -1778,7 +1778,7 @@ ACTOR Future commitProxyServerCore(CommitProxyInterface proxy, MasterInter state KeyRange txnKeys = allKeys; Standalone UIDtoTagMap = commitData.txnStateStore->readRange( serverTagKeys ).get(); state std::map tag_uid; - for (const KeyValueRef kv : UIDtoTagMap) { + for (const KeyValueRef& kv : UIDtoTagMap) { tag_uid[decodeServerTagValue(kv.value)] = decodeServerTagKey(kv.key); } loop { From 0d324cee80b306797e6f92392414b786ad5ce914 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Wed, 9 Dec 2020 10:19:59 -0700 Subject: [PATCH 006/317] Annotation framework and role lineage --- fdbrpc/CMakeLists.txt | 2 + fdbrpc/Locality.h | 1 + fdbrpc/RoleLineage.cpp | 23 ++++++++++ fdbrpc/RoleLineage.h | 31 +++++++++++++ fdbserver/worker.actor.cpp | 3 ++ flow/flow.cpp | 6 +++ flow/flow.h | 90 ++++++++++++++++++++++++++++++++------ 7 files changed, 142 insertions(+), 14 deletions(-) create mode 100644 fdbrpc/RoleLineage.cpp create mode 100644 fdbrpc/RoleLineage.h diff --git a/fdbrpc/CMakeLists.txt b/fdbrpc/CMakeLists.txt index b4fb20098d..41229dce47 100644 --- a/fdbrpc/CMakeLists.txt +++ b/fdbrpc/CMakeLists.txt @@ -22,6 +22,8 @@ set(FDBRPC_SRCS ReplicationPolicy.cpp ReplicationTypes.cpp ReplicationUtils.cpp + RoleLineage.h + RoleLineage.cpp Stats.actor.cpp Stats.h sim2.actor.cpp diff --git a/fdbrpc/Locality.h b/fdbrpc/Locality.h index 11c209071a..2129b7a3b7 100644 --- a/fdbrpc/Locality.h +++ b/fdbrpc/Locality.h @@ -63,6 +63,7 @@ struct ProcessClass { Ratekeeper, StorageCache, Backup, + Worker, // used for actor lineage tracking NoRole }; enum ClassSource { CommandLineSource, AutoSource, DBSource, InvalidSource = -1 }; diff --git a/fdbrpc/RoleLineage.cpp b/fdbrpc/RoleLineage.cpp new file mode 100644 index 0000000000..89a64bbe40 --- /dev/null +++ b/fdbrpc/RoleLineage.cpp @@ -0,0 +1,23 @@ +/* + * 
RoleLineage.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2020 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "fdbrpc/RoleLineage.h" + +StringRef RoleLineage::name = "RoleLineage"_sr; diff --git a/fdbrpc/RoleLineage.h b/fdbrpc/RoleLineage.h new file mode 100644 index 0000000000..30a2ea2650 --- /dev/null +++ b/fdbrpc/RoleLineage.h @@ -0,0 +1,31 @@ +/* + * RoleLineage.h + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2020 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once +#include "fdbrpc/Locality.h" + +struct RoleLineage : LineageProperties { + static StringRef name; + ProcessClass::ClusterRole role = ProcessClass::NoRole; + + bool isSet(ProcessClass::ClusterRole RoleLineage::*member) { + return this->*member != ProcessClass::NoRole; + } +}; diff --git a/fdbserver/worker.actor.cpp b/fdbserver/worker.actor.cpp index ca34f903a2..98363ea247 100644 --- a/fdbserver/worker.actor.cpp +++ b/fdbserver/worker.actor.cpp @@ -22,6 +22,7 @@ #include #include "fdbrpc/Locality.h" +#include "fdbrpc/RoleLineage.h" #include "fdbclient/StorageServerInterface.h" #include "fdbserver/Knobs.h" #include "flow/ActorCollection.h" @@ -46,6 +47,7 @@ #include "flow/Profiler.h" #include "flow/ThreadHelper.actor.h" #include "flow/Trace.h" +#include "flow/flow.h" #ifdef __linux__ #include @@ -1810,6 +1812,7 @@ ACTOR Future fdbd( { state vector> actors; state Promise recoveredDiskFiles; + currentLineage->modify(&RoleLineage::role) = ProcessClass::Worker; try { ServerCoordinators coordinators( connFile ); diff --git a/flow/flow.cpp b/flow/flow.cpp index ed977141bd..5b354fe054 100644 --- a/flow/flow.cpp +++ b/flow/flow.cpp @@ -31,6 +31,12 @@ thread_local Reference currentLineage; ActorLineage::ActorLineage() : parent(currentLineage) { } +ActorLineage::~ActorLineage() { + for (auto ptr : properties) { + delete ptr.second; + } +} + #if (defined(__linux__) || defined(__FreeBSD__)) && defined(__AVX__) && !defined(MEMORY_SANITIZER) // For benchmarking; need a version of rte_memcpy that doesn't live in the same compilation unit as the test. 
void * rte_memcpy_noinline(void *__restrict __dest, const void *__restrict __src, size_t __n) { diff --git a/flow/flow.h b/flow/flow.h index a0c9793a7a..0ffc895a86 100644 --- a/flow/flow.h +++ b/flow/flow.h @@ -20,6 +20,7 @@ #ifndef FLOW_FLOW_H #define FLOW_FLOW_H +#include "flow/Arena.h" #include "flow/FastRef.h" #pragma once @@ -29,6 +30,7 @@ #include #include +#include #include #include #include @@ -409,21 +411,88 @@ struct SingleCallback { } }; -struct ActorLineagePropertyMap : ReferenceCounted { +struct LineagePropertiesBase { +}; + +// helper class to make implementation of LineageProperties easier +template +struct LineageProperties : LineagePropertiesBase { + // Contract: + // + // StringRef name = "SomeUniqueName"_str; + + + // this has to be implemented by subclasses + // but can't be made virtual. + // A user should implement this for any type + // within the properies class. + template + bool isSet(Value Derived::*member) { + return true; + } }; struct ActorLineage : ReferenceCounted { - Reference map; +private: + std::unordered_map properties; Reference parent; +public: ActorLineage(); + ~ActorLineage(); + bool isRoot() const { + return parent.getPtr() == nullptr; + } + void makeRoot() { + parent.clear(); + } + template + V& modify(V T::*member) { + auto& res = properties[T::name]; + if (!res) { + res = new T{}; + } + T* map = static_cast(res); + return map->*member; + } + template + std::optional get(V T::*member) const { + auto current = this; + while (current != nullptr) { + auto iter = current->properties.find(T::name); + if (iter != current->properties.end()) { + T const& map = static_cast(*iter->second); + if (map.isSet(member)) { + return map.*member; + } + } + current = current->parent.getPtr(); + } + return std::optional{}; + } + template + std::stack stack(V T::*member) const { + auto current = this; + std::stack res; + while (current != nullptr) { + auto iter = current->properties.find(T::name); + if (iter != current->properties.end()) { + T 
const& map = static_cast(*iter->second); + if (map.isSet(member)) { + res.push(map.*member); + } + } + current = current->parent.getPtr(); + } + return res; + } }; extern thread_local Reference currentLineage; struct restore_lineage { - Reference lineage; - restore_lineage() : lineage(currentLineage) {} - ~restore_lineage() { currentLineage = lineage; } + Reference prev; + restore_lineage() : prev(currentLineage) {} + ~restore_lineage() { currentLineage = prev; } }; // SAV is short for Single Assignment Variable: It can be assigned for only once! @@ -465,7 +534,6 @@ public: ASSERT(canBeSet()); new (&value_storage) T(std::forward(value)); this->error_state = Error::fromCode(SET_ERROR_CODE); - restore_lineage _; while (Callback::next != this) { Callback::next->fire(this->value()); } @@ -479,7 +547,6 @@ public: void sendError(Error err) { ASSERT(canBeSet() && int16_t(err.code()) > 0); this->error_state = err; - restore_lineage _; while (Callback::next != this) { Callback::next->error(err); } @@ -487,7 +554,6 @@ public: template void sendAndDelPromiseRef(U && value) { - restore_lineage _; ASSERT(canBeSet()); if (promises == 1 && !futures) { // No one is left to receive the value, so we can just die @@ -501,7 +567,6 @@ public: void finishSendAndDelPromiseRef() { // Call only after value_storage has already been initialized! 
- restore_lineage _; this->error_state = Error::fromCode(SET_ERROR_CODE); while (Callback::next != this) Callback::next->fire(this->value()); @@ -518,7 +583,6 @@ public: } void sendErrorAndDelPromiseRef(Error err) { - restore_lineage _; ASSERT(canBeSet() && int16_t(err.code()) > 0); if (promises == 1 && !futures) { // No one is left to receive the value, so we can just die @@ -622,7 +686,6 @@ struct NotifiedQueue : private SingleCallback, FastAllocated if (error.isValid()) return; if (SingleCallback::next != this) { - restore_lineage _; SingleCallback::next->fire(std::forward(value)); } else { @@ -635,7 +698,6 @@ struct NotifiedQueue : private SingleCallback, FastAllocated this->error = err; if (SingleCallback::next != this) { - restore_lineage _; SingleCallback::next->error(err); } } @@ -1025,13 +1087,13 @@ struct Actor : SAV { /*++actorCount;*/ currentLineage = lineage; } + //~Actor() { --actorCount; } Reference setLineage() { auto res = currentLineage; currentLineage = lineage; return res; } - //~Actor() { --actorCount; } }; template <> @@ -1045,13 +1107,13 @@ struct Actor { /*++actorCount;*/ currentLineage = lineage; } + //~Actor() { --actorCount; } Reference setLineage() { auto res = currentLineage; currentLineage = lineage; return res; } - //~Actor() { --actorCount; } }; template From 945d0246cddc0dcfff982f22af54c43617bc79a8 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Wed, 9 Dec 2020 13:28:15 -0700 Subject: [PATCH 007/317] add actor stacktrace feature --- flow/actorcompiler/ActorCompiler.cs | 3 ++- flow/flow.cpp | 6 ++++++ flow/flow.h | 12 ++++++++++++ 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/flow/actorcompiler/ActorCompiler.cs b/flow/actorcompiler/ActorCompiler.cs index dc9de91868..28771f4503 100644 --- a/flow/actorcompiler/ActorCompiler.cs +++ b/flow/actorcompiler/ActorCompiler.cs @@ -452,7 +452,7 @@ namespace actorcompiler fullClassName, string.Join(", ", actor.parameters.Select(p => p.name).ToArray())); - 
writer.WriteLine("restore_lineage _;"); + writer.WriteLine("\trestore_lineage _;"); if (actor.returnType != null) writer.WriteLine("\treturn Future<{1}>({0});", newActor, actor.returnType); else @@ -1287,6 +1287,7 @@ namespace actorcompiler constructor.WriteLine("{"); constructor.Indent(+1); ProbeEnter(constructor, actor.name); + constructor.WriteLine("currentLineage->modify(&StackLineage::actorName) = LiteralStringRef(\"{0}\");", actor.name); constructor.WriteLine("this->{0};", body.call()); ProbeExit(constructor, actor.name); WriteFunction(writer, constructor, constructor.BodyText); diff --git a/flow/flow.cpp b/flow/flow.cpp index 5b354fe054..2e47847fcd 100644 --- a/flow/flow.cpp +++ b/flow/flow.cpp @@ -37,6 +37,12 @@ ActorLineage::~ActorLineage() { } } +StringRef StackLineage::name = "StackLineage"_sr; + +std::stack getActorStackTrace() { + return currentLineage->stack(&StackLineage::actorName); +} + #if (defined(__linux__) || defined(__FreeBSD__)) && defined(__AVX__) && !defined(MEMORY_SANITIZER) // For benchmarking; need a version of rte_memcpy that doesn't live in the same compilation unit as the test. void * rte_memcpy_noinline(void *__restrict __dest, const void *__restrict __src, size_t __n) { diff --git a/flow/flow.h b/flow/flow.h index 0ffc895a86..518dbd036c 100644 --- a/flow/flow.h +++ b/flow/flow.h @@ -495,6 +495,18 @@ struct restore_lineage { ~restore_lineage() { currentLineage = prev; } }; +struct StackLineage : LineageProperties { + static StringRef name; + StringRef actorName; + + template + bool isSet(Value StackLineage::*member) { + return true; + } +}; + +extern std::stack getActorStackTrace(); + // SAV is short for Single Assignment Variable: It can be assigned for only once! 
template struct SAV : private Callback, FastAllocated> { From f8e1df6c4f8c5a687afffe2b9a28aa13e32ae9d5 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Thu, 10 Dec 2020 10:42:04 -0700 Subject: [PATCH 008/317] Support for actor stack traces --- fdbrpc/RoleLineage.h | 2 +- fdbserver/CMakeLists.txt | 1 + fdbserver/SigStack.cpp | 23 +++++++++++++++++++++++ fdbserver/worker.actor.cpp | 3 +++ flow/flow.h | 7 +------ tests/TestRunner/local_cluster.py | 2 +- 6 files changed, 30 insertions(+), 8 deletions(-) create mode 100644 fdbserver/SigStack.cpp diff --git a/fdbrpc/RoleLineage.h b/fdbrpc/RoleLineage.h index 30a2ea2650..8e9d3f4e9e 100644 --- a/fdbrpc/RoleLineage.h +++ b/fdbrpc/RoleLineage.h @@ -25,7 +25,7 @@ struct RoleLineage : LineageProperties { static StringRef name; ProcessClass::ClusterRole role = ProcessClass::NoRole; - bool isSet(ProcessClass::ClusterRole RoleLineage::*member) { + bool isSet(ProcessClass::ClusterRole RoleLineage::*member) const { return this->*member != ProcessClass::NoRole; } }; diff --git a/fdbserver/CMakeLists.txt b/fdbserver/CMakeLists.txt index bf266069cb..f52e5b8279 100644 --- a/fdbserver/CMakeLists.txt +++ b/fdbserver/CMakeLists.txt @@ -88,6 +88,7 @@ set(FDBSERVER_SRCS ResolverInterface.h ServerDBInfo.actor.h ServerDBInfo.h + SigStack.cpp SimulatedCluster.actor.cpp SimulatedCluster.h SkipList.cpp diff --git a/fdbserver/SigStack.cpp b/fdbserver/SigStack.cpp new file mode 100644 index 0000000000..efec5aff7d --- /dev/null +++ b/fdbserver/SigStack.cpp @@ -0,0 +1,23 @@ +#include "flow/flow.h" +#include +#include +#include + +// This is not yet correct, as this is not async safe +// However, this should be good enough for an initial +// proof of concept. 
+extern "C" void stackSignalHandler(int sig) { + auto stack = getActorStackTrace(); + int i = 0; + while (!stack.empty()) { + auto s = stack.top(); + stack.pop(); + std::string_view n(reinterpret_cast(s.begin()), s.size()); + std::cout << i << ": " << n << std::endl; + ++i; + } +} + +void setupStackSignal() { + std::signal(SIGUSR1, &stackSignalHandler); +} diff --git a/fdbserver/worker.actor.cpp b/fdbserver/worker.actor.cpp index 98363ea247..5d371c0c80 100644 --- a/fdbserver/worker.actor.cpp +++ b/fdbserver/worker.actor.cpp @@ -1798,6 +1798,8 @@ ACTOR Future monitorLeaderRemotelyWithDelayedCandidacy( Reference fdbd( Reference connFile, LocalityData localities, @@ -1812,6 +1814,7 @@ ACTOR Future fdbd( { state vector> actors; state Promise recoveredDiskFiles; + setupStackSignal(); currentLineage->modify(&RoleLineage::role) = ProcessClass::Worker; try { diff --git a/flow/flow.h b/flow/flow.h index 518dbd036c..b1e4c1e1fb 100644 --- a/flow/flow.h +++ b/flow/flow.h @@ -427,7 +427,7 @@ struct LineageProperties : LineagePropertiesBase { // A user should implement this for any type // within the properies class. 
template - bool isSet(Value Derived::*member) { + bool isSet(Value Derived::*member) const { return true; } }; @@ -498,11 +498,6 @@ struct restore_lineage { struct StackLineage : LineageProperties { static StringRef name; StringRef actorName; - - template - bool isSet(Value StackLineage::*member) { - return true; - } }; extern std::stack getActorStackTrace(); diff --git a/tests/TestRunner/local_cluster.py b/tests/TestRunner/local_cluster.py index 68318d51dd..85f2094774 100644 --- a/tests/TestRunner/local_cluster.py +++ b/tests/TestRunner/local_cluster.py @@ -38,7 +38,7 @@ cluster_file = {etcdir}/fdb.cluster command = {fdbserver_bin} public_address = auto:$ID listen_address = public -datadir = {datadir} +datadir = {datadir}/$ID logdir = {logdir} # logsize = 10MiB # maxlogssize = 100MiB From fb64902d5c5b6e88501ebe906d4d939f61257b9b Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Tue, 19 Jan 2021 16:04:09 -0700 Subject: [PATCH 009/317] Assign roles --- fdbrpc/CMakeLists.txt | 2 -- fdbserver/CMakeLists.txt | 2 ++ .../RoleLineage.actor.cpp | 2 +- .../RoleLineage.actor.h | 21 ++++++++++++++- fdbserver/worker.actor.cpp | 26 ++++++++++++++++++- flow/flow.cpp | 5 ++-- flow/flow.h | 16 ++++++++++++ 7 files changed, 67 insertions(+), 7 deletions(-) rename fdbrpc/RoleLineage.cpp => fdbserver/RoleLineage.actor.cpp (95%) rename fdbrpc/RoleLineage.h => fdbserver/RoleLineage.actor.h (59%) diff --git a/fdbrpc/CMakeLists.txt b/fdbrpc/CMakeLists.txt index 7a9ce26a10..af84676be7 100644 --- a/fdbrpc/CMakeLists.txt +++ b/fdbrpc/CMakeLists.txt @@ -22,8 +22,6 @@ set(FDBRPC_SRCS ReplicationPolicy.cpp ReplicationTypes.cpp ReplicationUtils.cpp - RoleLineage.h - RoleLineage.cpp Stats.actor.cpp Stats.h sim2.actor.cpp diff --git a/fdbserver/CMakeLists.txt b/fdbserver/CMakeLists.txt index afc45b2cc4..9e406a0d26 100644 --- a/fdbserver/CMakeLists.txt +++ b/fdbserver/CMakeLists.txt @@ -86,6 +86,8 @@ set(FDBSERVER_SRCS RestoreWorker.actor.cpp Resolver.actor.cpp ResolverInterface.h + 
RoleLineage.actor.h + RoleLineage.actor.cpp ServerDBInfo.actor.h ServerDBInfo.h SigStack.cpp diff --git a/fdbrpc/RoleLineage.cpp b/fdbserver/RoleLineage.actor.cpp similarity index 95% rename from fdbrpc/RoleLineage.cpp rename to fdbserver/RoleLineage.actor.cpp index 89a64bbe40..6d1b49527a 100644 --- a/fdbrpc/RoleLineage.cpp +++ b/fdbserver/RoleLineage.actor.cpp @@ -18,6 +18,6 @@ * limitations under the License. */ -#include "fdbrpc/RoleLineage.h" +#include "fdbserver/RoleLineage.actor.h" StringRef RoleLineage::name = "RoleLineage"_sr; diff --git a/fdbrpc/RoleLineage.h b/fdbserver/RoleLineage.actor.h similarity index 59% rename from fdbrpc/RoleLineage.h rename to fdbserver/RoleLineage.actor.h index 8e9d3f4e9e..d35c749771 100644 --- a/fdbrpc/RoleLineage.h +++ b/fdbserver/RoleLineage.actor.h @@ -1,5 +1,5 @@ /* - * RoleLineage.h + * RoleLineage.actor.h * * This source file is part of the FoundationDB open source project * @@ -19,7 +19,15 @@ */ #pragma once +#include "flow/flow.h" +#if defined(NO_INTELLISENSE) && !defined(FDBSERVER_ROLE_LINEAGE_ACTOR_G_H) +# define FDBSERVER_ROLE_LINEAGE_ACTOR_G_H +# include "fdbserver/RoleLineage.actor.g.h" +#elif !defined(FDBSERVER_ROLE_LINEAGE_ACTOR_H) +# define FDBSERVER_ROLE_LINEAGE_ACTOR_H + #include "fdbrpc/Locality.h" +#include "flow/actorcompiler.h" // This must be the last include struct RoleLineage : LineageProperties { static StringRef name; @@ -29,3 +37,14 @@ struct RoleLineage : LineageProperties { return this->*member != ProcessClass::NoRole; } }; + +// creates a new root and sets the role lineage +ACTOR template +Future()())> runInRole(Fun fun, ProcessClass::ClusterRole role) { + currentLineage->makeRoot(); + currentLineage->modify(&RoleLineage::role) = role; + decltype(std::declval()()) res = wait(fun()); + return res; +} + +#endif diff --git a/fdbserver/worker.actor.cpp b/fdbserver/worker.actor.cpp index 36f5c14860..19aea8622c 100644 --- a/fdbserver/worker.actor.cpp +++ b/fdbserver/worker.actor.cpp @@ -22,7 +22,6 @@ 
#include #include "fdbrpc/Locality.h" -#include "fdbrpc/RoleLineage.h" #include "fdbclient/StorageServerInterface.h" #include "fdbserver/Knobs.h" #include "flow/ActorCollection.h" @@ -33,6 +32,7 @@ #include "fdbclient/NativeAPI.actor.h" #include "fdbserver/MetricLogger.h" #include "fdbserver/BackupInterface.h" +#include "fdbserver/RoleLineage.actor.h" #include "fdbserver/WorkerInterface.actor.h" #include "fdbserver/IKeyValueStore.h" #include "fdbserver/WaitFailure.h" @@ -1024,6 +1024,8 @@ ACTOR Future workerServer( DiskStore s = stores[f]; // FIXME: Error handling if( s.storedComponent == DiskStore::Storage ) { + LocalLineage _; + currentLineage->modify(&RoleLineage::role) = ProcessClass::ClusterRole::Storage; IKeyValueStore* kv = openKVStore(s.storeType, s.filename, s.storeID, memoryLimit, false, validateDataFiles); Future kvClosed = kv->onClosed(); filesClosed.add( kvClosed ); @@ -1058,6 +1060,8 @@ ACTOR Future workerServer( f = storageServerRollbackRebooter( f, s.storeType, s.filename, recruited.id(), recruited.locality, dbInfo, folder, &filesClosed, memoryLimit, kv); errorForwarders.add( forwardError( errors, Role::STORAGE_SERVER, recruited.id(), f ) ); } else if( s.storedComponent == DiskStore::TLogData ) { + LocalLineage _; + currentLineage->modify(&RoleLineage::role) = ProcessClass::ClusterRole::TLog; std::string logQueueBasename; const std::string filename = basename(s.filename); if (StringRef(filename).startsWith(fileLogDataPrefix)) { @@ -1218,6 +1222,8 @@ ACTOR Future workerServer( } } when( RecruitMasterRequest req = waitNext(interf.master.getFuture()) ) { + LocalLineage _; + currentLineage->modify(&RoleLineage::role) = ProcessClass::ClusterRole::Master; MasterInterface recruited; recruited.locality = locality; recruited.initEndpoints(); @@ -1238,6 +1244,8 @@ ACTOR Future workerServer( req.reply.send(recruited); } when ( InitializeDataDistributorRequest req = waitNext(interf.dataDistributor.getFuture()) ) { + LocalLineage _; + 
currentLineage->modify(&RoleLineage::role) = ProcessClass::ClusterRole::DataDistributor; DataDistributorInterface recruited(locality); recruited.initEndpoints(); @@ -1256,6 +1264,8 @@ ACTOR Future workerServer( req.reply.send(recruited); } when ( InitializeRatekeeperRequest req = waitNext(interf.ratekeeper.getFuture()) ) { + LocalLineage _; + currentLineage->modify(&RoleLineage::role) = ProcessClass::ClusterRole::Ratekeeper; RatekeeperInterface recruited(locality, req.reqId); recruited.initEndpoints(); @@ -1280,6 +1290,8 @@ ACTOR Future workerServer( } when (InitializeBackupRequest req = waitNext(interf.backup.getFuture())) { if (!backupWorkerCache.exists(req.reqId)) { + LocalLineage _; + currentLineage->modify(&RoleLineage::role) = ProcessClass::ClusterRole::Backup; BackupInterface recruited(locality); recruited.initEndpoints(); @@ -1309,6 +1321,8 @@ ACTOR Future workerServer( .detail("MinRecruitable", TLogVersion::MIN_RECRUITABLE); req.reply.sendError(internal_error()); } + LocalLineage _; + currentLineage->modify(&RoleLineage::role) = ProcessClass::ClusterRole::TLog; TLogOptions tLogOptions(req.logVersion, req.spillType); TLogFn tLogFn = tLogFnForOptions(tLogOptions); auto& logData = sharedLogs[SharedLogsKey(tLogOptions, req.storeType)]; @@ -1341,6 +1355,8 @@ ACTOR Future workerServer( } when( InitializeStorageRequest req = waitNext(interf.storage.getFuture()) ) { if( !storageCache.exists( req.reqId ) ) { + LocalLineage _; + currentLineage->modify(&RoleLineage::role) = ProcessClass::ClusterRole::Storage; StorageServerInterface recruited(req.interfaceId); recruited.locality = locality; recruited.initEndpoints(); @@ -1379,6 +1395,8 @@ ACTOR Future workerServer( forwardPromise( req.reply, storageCache.get( req.reqId ) ); } when(InitializeCommitProxyRequest req = waitNext(interf.commitProxy.getFuture())) { + LocalLineage _; + currentLineage->modify(&RoleLineage::role) = ProcessClass::ClusterRole::CommitProxy; CommitProxyInterface recruited; recruited.processId = 
locality.processId(); recruited.provisional = false; @@ -1402,6 +1420,8 @@ ACTOR Future workerServer( req.reply.send(recruited); } when( InitializeGrvProxyRequest req = waitNext(interf.grvProxy.getFuture()) ) { + LocalLineage _; + currentLineage->modify(&RoleLineage::role) = ProcessClass::ClusterRole::GrvProxy; GrvProxyInterface recruited; recruited.processId = locality.processId(); recruited.provisional = false; @@ -1421,6 +1441,8 @@ ACTOR Future workerServer( req.reply.send(recruited); } when( InitializeResolverRequest req = waitNext(interf.resolver.getFuture()) ) { + LocalLineage _; + currentLineage->modify(&RoleLineage::role) = ProcessClass::ClusterRole::Resolver; ResolverInterface recruited; recruited.locality = locality; recruited.initEndpoints(); @@ -1438,6 +1460,8 @@ ACTOR Future workerServer( req.reply.send(recruited); } when( InitializeLogRouterRequest req = waitNext(interf.logRouter.getFuture()) ) { + LocalLineage _; + currentLineage->modify(&RoleLineage::role) = ProcessClass::ClusterRole::LogRouter; TLogInterface recruited(locality); recruited.initEndpoints(); diff --git a/flow/flow.cpp b/flow/flow.cpp index 2e47847fcd..c90bbbe9ae 100644 --- a/flow/flow.cpp +++ b/flow/flow.cpp @@ -28,8 +28,9 @@ thread_local Reference currentLineage; -ActorLineage::ActorLineage() : parent(currentLineage) { -} +LineagePropertiesBase::~LineagePropertiesBase() {} + +ActorLineage::ActorLineage() : parent(currentLineage) {} ActorLineage::~ActorLineage() { for (auto ptr : properties) { diff --git a/flow/flow.h b/flow/flow.h index e043ab49d4..9b3ba698b6 100644 --- a/flow/flow.h +++ b/flow/flow.h @@ -412,6 +412,7 @@ struct SingleCallback { }; struct LineagePropertiesBase { + virtual ~LineagePropertiesBase(); }; // helper class to make implementation of LineageProperties easier @@ -433,6 +434,7 @@ struct LineageProperties : LineagePropertiesBase { }; struct ActorLineage : ReferenceCounted { + friend class LocalLineage; private: std::unordered_map properties; Reference parent; @@ 
-489,6 +491,20 @@ public: extern thread_local Reference currentLineage; +// This class can be used in order to modify all lineage properties +// of actors created within a (non-actor) scope +struct LocalLineage { + Reference lineage = Reference{new ActorLineage() }; + Reference oldLineage; + LocalLineage() { + oldLineage = currentLineage; + currentLineage = lineage; + } + ~LocalLineage() { + currentLineage = oldLineage; + } +}; + struct restore_lineage { Reference prev; restore_lineage() : prev(currentLineage) {} From f40d8c2f490a08351ce3d7e91bfd6752e268548a Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Tue, 19 Jan 2021 16:04:21 -0700 Subject: [PATCH 010/317] make profiler signal handler reentrant safe --- flow/Profiler.actor.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/flow/Profiler.actor.cpp b/flow/Profiler.actor.cpp index ece9bcfafd..33d1542db7 100644 --- a/flow/Profiler.actor.cpp +++ b/flow/Profiler.actor.cpp @@ -148,6 +148,8 @@ struct Profiler { } void signal_handler() { // async signal safe! + static std::atomic inSigHandler = false; + if (!inSigHandler.exchange(true)) { return; } if(profilingEnabled) { double t = timer(); output_buffer->push(*(void**)&t); @@ -156,6 +158,7 @@ struct Profiler { output_buffer->push(addresses[i]); output_buffer->push((void*)-1LL); } + inSigHandler.store(false); } static void signal_handler_for_closure(int, siginfo_t* si, void*, void* self) { // async signal safe! From c3efbe3040770dae65319446b9b3877f29b0ee44 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Tue, 19 Jan 2021 16:52:30 -0700 Subject: [PATCH 011/317] fixed minor bug --- flow/Profiler.actor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flow/Profiler.actor.cpp b/flow/Profiler.actor.cpp index 33d1542db7..d691f46205 100644 --- a/flow/Profiler.actor.cpp +++ b/flow/Profiler.actor.cpp @@ -149,7 +149,7 @@ struct Profiler { void signal_handler() { // async signal safe! 
static std::atomic inSigHandler = false; - if (!inSigHandler.exchange(true)) { return; } + if (inSigHandler.exchange(true)) { return; } if(profilingEnabled) { double t = timer(); output_buffer->push(*(void**)&t); From 5259721a5858a4bdd4eba0877cf931667cc5ac12 Mon Sep 17 00:00:00 2001 From: "Johannes M. Scheuermann" Date: Sun, 14 Mar 2021 19:46:12 +0000 Subject: [PATCH 012/317] Use only one IP address that matches the hostname --- packaging/docker/create_server_environment.bash | 9 +++------ packaging/docker/fdb.bash | 2 +- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/packaging/docker/create_server_environment.bash b/packaging/docker/create_server_environment.bash index 04a23792e2..51a782f991 100644 --- a/packaging/docker/create_server_environment.bash +++ b/packaging/docker/create_server_environment.bash @@ -23,21 +23,18 @@ source /var/fdb/scripts/create_cluster_file.bash function create_server_environment() { - fdb_dir=/var/fdb - env_file=$fdb_dir/.fdbenv - - : > $env_file + env_file=/var/fdb/.fdbenv if [[ "$FDB_NETWORKING_MODE" == "host" ]]; then public_ip=127.0.0.1 elif [[ "$FDB_NETWORKING_MODE" == "container" ]]; then - public_ip=$(grep `hostname` /etc/hosts | sed -e "s/\s *`hostname`.*//") + public_ip=$(hostname -i | awk '{print $1}') else echo "Unknown FDB Networking mode \"$FDB_NETWORKING_MODE\"" 1>&2 exit 1 fi - echo "export PUBLIC_IP=$public_ip" >> $env_file + echo "export PUBLIC_IP=$public_ip" > $env_file if [[ -z $FDB_COORDINATOR && -z "$FDB_CLUSTER_FILE_CONTENTS" ]]; then FDB_CLUSTER_FILE_CONTENTS="docker:docker@$public_ip:$FDB_PORT" fi diff --git a/packaging/docker/fdb.bash b/packaging/docker/fdb.bash index 3bf1c6a680..943c8ed58b 100644 --- a/packaging/docker/fdb.bash +++ b/packaging/docker/fdb.bash @@ -26,4 +26,4 @@ source /var/fdb/.fdbenv echo "Starting FDB server on $PUBLIC_IP:$FDB_PORT" fdbserver --listen_address 0.0.0.0:$FDB_PORT --public_address $PUBLIC_IP:$FDB_PORT \ --datadir /var/fdb/data --logdir /var/fdb/logs \ - 
--locality_zoneid=`hostname` --locality_machineid=`hostname` --class $FDB_PROCESS_CLASS + --locality_zoneid="$(hostname)" --locality_machineid="$(hostname)" --class $FDB_PROCESS_CLASS From 29c626ca6a0d02f1d412327e177cc5db36b02042 Mon Sep 17 00:00:00 2001 From: Jon Fu Date: Mon, 15 Mar 2021 17:36:13 -0400 Subject: [PATCH 013/317] Changed code flow to fix loophole that avoided the knob guarding higher protocol versions and also added new restarting tests --- fdbserver/MoveKeys.actor.cpp | 24 ++++++++------- tests/CMakeLists.txt | 3 ++ .../to_6.2.33/CycleTestRestart-1.txt | 30 +++++++++++++++++++ .../to_6.2.33/CycleTestRestart-2.txt | 26 ++++++++++++++++ 4 files changed, 73 insertions(+), 10 deletions(-) create mode 100644 tests/restarting/to_6.2.33/CycleTestRestart-1.txt create mode 100644 tests/restarting/to_6.2.33/CycleTestRestart-2.txt diff --git a/fdbserver/MoveKeys.actor.cpp b/fdbserver/MoveKeys.actor.cpp index c08f3f3476..83f7170e95 100644 --- a/fdbserver/MoveKeys.actor.cpp +++ b/fdbserver/MoveKeys.actor.cpp @@ -1232,23 +1232,27 @@ void seedShardServers(Arena& arena, CommitTransactionRef& tr, vector serverTags; + std::vector serverSrcUID; serverTags.reserve(servers.size()); - for (int i = 0; i < servers.size(); i++) - serverTags.push_back(server_tag[servers[i].id()]); + for (auto& s : servers) { + serverTags.push_back(server_tag[s.id()]); + serverSrcUID.push_back(s.id()); + } + auto ksValue = CLIENT_KNOBS->TAG_ENCODE_KEY_SERVERS ? 
keyServersValue(serverTags) + : keyServersValue(Standalone(), serverSrcUID); // We have to set this range in two blocks, because the master tracking of "keyServersLocations" depends on a change // to a specific // key (keyServersKeyServersKey) - krmSetPreviouslyEmptyRange( - tr, arena, keyServersPrefix, KeyRangeRef(KeyRef(), allKeys.end), keyServersValue(serverTags), Value()); + krmSetPreviouslyEmptyRange(tr, arena, keyServersPrefix, KeyRangeRef(KeyRef(), allKeys.end), ksValue, Value()); - for (int s = 0; s < servers.size(); s++) - krmSetPreviouslyEmptyRange( - tr, arena, serverKeysPrefixFor(servers[s].id()), allKeys, serverKeysTrue, serverKeysFalse); + for (auto& s : servers) { + krmSetPreviouslyEmptyRange(tr, arena, serverKeysPrefixFor(s.id()), allKeys, serverKeysTrue, serverKeysFalse); + } } diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 132616b1bb..16f0eb2170 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -204,6 +204,9 @@ if(WITH_PYTHON) add_fdb_test( TEST_FILES restarting/from_7.0.0/UpgradeAndBackupRestore-1.toml restarting/from_7.0.0/UpgradeAndBackupRestore-2.toml) + add_fdb_test( + TEST_FILES restarting/to_6.2.33/CycleTestRestart-1.txt + restarting/to_6.2.33/CycleTestRestart-2.txt IGNORE) add_fdb_test( TEST_FILES restarting/to_6.3.10/CycleTestRestart-1.txt restarting/to_6.3.10/CycleTestRestart-2.txt) diff --git a/tests/restarting/to_6.2.33/CycleTestRestart-1.txt b/tests/restarting/to_6.2.33/CycleTestRestart-1.txt new file mode 100644 index 0000000000..647c2f3fe3 --- /dev/null +++ b/tests/restarting/to_6.2.33/CycleTestRestart-1.txt @@ -0,0 +1,30 @@ +testTitle=Clogged + clearAfterTest=false + testName=Cycle + transactionsPerSecond=500.0 + nodeCount=2500 + testDuration=10.0 + expectedRate=0 + + testName=RandomClogging + testDuration=10.0 + + testName=Rollback + meanDelay=10.0 + testDuration=10.0 + + testName=Attrition + machinesToKill=10 + machinesToLeave=3 + reboot=true + testDuration=10.0 + + testName=Attrition + 
machinesToKill=10 + machinesToLeave=3 + reboot=true + testDuration=10.0 + + testName=SaveAndKill + restartInfoLocation=simfdb/restartInfo.ini + testDuration=10.0 diff --git a/tests/restarting/to_6.2.33/CycleTestRestart-2.txt b/tests/restarting/to_6.2.33/CycleTestRestart-2.txt new file mode 100644 index 0000000000..7d498f2be1 --- /dev/null +++ b/tests/restarting/to_6.2.33/CycleTestRestart-2.txt @@ -0,0 +1,26 @@ +testTitle=Clogged + runSetup=false + testName=Cycle + transactionsPerSecond=2500.0 + nodeCount=2500 + testDuration=10.0 + expectedRate=0 + + testName=RandomClogging + testDuration=10.0 + + testName=Rollback + meanDelay=10.0 + testDuration=10.0 + + testName=Attrition + machinesToKill=10 + machinesToLeave=3 + reboot=true + testDuration=10.0 + + testName=Attrition + machinesToKill=10 + machinesToLeave=3 + reboot=true + testDuration=10.0 From a8c7a798f2483c22ffd6c8dacbb0946c81237c12 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Wed, 17 Mar 2021 15:34:20 -0600 Subject: [PATCH 014/317] First prototype of actorlineageset --- flow/ActorLineageSet.cpp | 118 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 118 insertions(+) create mode 100644 flow/ActorLineageSet.cpp diff --git a/flow/ActorLineageSet.cpp b/flow/ActorLineageSet.cpp new file mode 100644 index 0000000000..9fb93e9df7 --- /dev/null +++ b/flow/ActorLineageSet.cpp @@ -0,0 +1,118 @@ +/* + * ActorLineageSet.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flow/flow.h" +#include + +class ActorLineageSet { +public: + // The type we use for lookup into the set. Gets assigned during insert + using Index = unsigned; + // For now we use a fixed size capacity + constexpr static Index CAPACITY = 1024; + constexpr static Index npos = std::numeric_limits::max(); + + explicit ActorLineageSet(); + ActorLineageSet(const ActorLineageSet&) = delete; + ActorLineageSet& operator=(const ActorLineageSet&) = delete; + + // Returns the number of elements at the time of calling. Keep in mind that this is a lockfree data structure, so + // the actual size might change anytime after or even during the call. This function only guarantees that the size + // was whatever the method returns at one point between the start and the end of the function call. The safest way + // to handle this is by assuming that this returns an estimate. 
+ unsigned size(); + + Index insert(const Reference& lineage); + void erase(Index idx); + std::vector> copy(); + +private: + static constexpr uintptr_t FREE = 0b1; + static constexpr uintptr_t LOCK = 0b10; + std::atomic _size = 0; + std::vector> _set; + boost::lockfree::queue, boost::lockfree::capacity> freeQueue; + boost::lockfree::queue, boost::lockfree::capacity> + freeList; +}; + +ActorLineageSet::ActorLineageSet() { + // insert the free indexes in reverse order + for (unsigned i = CAPACITY; i > 0; --i) { + freeQueue.push(i - 1); + _set[i] = uintptr_t(1); + } +} + +std::vector> ActorLineageSet::copy() { + std::vector> result; + for (int i = 0; i < CAPACITY; ++i) { + auto ptr = _set[i].load(); + if ((ptr & FREE) != 0) { + ASSERT((ptr & LOCK) == 0); + if (_set[i].compare_exchange_strong(ptr, ptr | LOCK)) { + ActorLineage* entry = reinterpret_cast(ptr); + ptr |= LOCK; + entry->addref(); + // we try to unlock now. If this element was removed while we incremented the refcount, the element will + // end up in the freeList, so we will decrement later. + _set[i].compare_exchange_strong(ptr, ptr ^ LOCK); + result.emplace_back(entry); + } + } + } + // after we're done we need to clean up all objects that contented on a lock. 
This won't be perfect (as some thread + // might not yet added the object to the free list), but whatever we don't get now we'll clean up in the next + // iteration + ActorLineage* toClean; + while (freeList.pop(toClean)) { + toClean->delref(); + } + return result; +} + +ActorLineageSet::Index ActorLineageSet::insert(const Reference& lineage) { + Index res; + if (!freeQueue.pop(res)) { + TraceEvent(SevWarnAlways, "NoCapacityInActorLineageSet"); + return npos; + } + ASSERT(_set[res].load() & FREE); + auto ptr = reinterpret_cast(lineage.getPtr()); + lineage->addref(); + _set[res].store(ptr); + return res; +} + +void ActorLineageSet::erase(Index idx) { + while (true) { + auto ptr = _set[idx].load(); + if (ptr & LOCK) { + _set[idx].store(FREE); + freeList.push(reinterpret_cast(ptr ^ LOCK)); + return; + } else { + if (_set[idx].compare_exchange_strong(ptr, FREE)) { + reinterpret_cast(ptr)->delref(); + return; + } + } + } +} \ No newline at end of file From 9812a49058adf16c2cdd1445f876f372be074109 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Wed, 17 Mar 2021 15:40:19 -0600 Subject: [PATCH 015/317] use consume_all to clean up after copy --- flow/ActorLineageSet.cpp | 5 +---- flow/CMakeLists.txt | 1 + 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/flow/ActorLineageSet.cpp b/flow/ActorLineageSet.cpp index 9fb93e9df7..0957339501 100644 --- a/flow/ActorLineageSet.cpp +++ b/flow/ActorLineageSet.cpp @@ -81,10 +81,7 @@ std::vector> ActorLineageSet::copy() { // after we're done we need to clean up all objects that contented on a lock. 
This won't be perfect (as some thread // might not yet added the object to the free list), but whatever we don't get now we'll clean up in the next // iteration - ActorLineage* toClean; - while (freeList.pop(toClean)) { - toClean->delref(); - } + freeList.consume_all([](auto toClean) { toClean->delRef(); }); return result; } diff --git a/flow/CMakeLists.txt b/flow/CMakeLists.txt index c838e8eff8..5e89fe4d28 100644 --- a/flow/CMakeLists.txt +++ b/flow/CMakeLists.txt @@ -3,6 +3,7 @@ find_package(Threads REQUIRED) set(FLOW_SRCS ActorCollection.actor.cpp ActorCollection.h + ActorLineageSet.cpp Arena.cpp Arena.h AsioReactor.h From f6c7aa6ac77e55266e030109eb77d24b8894952e Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Wed, 17 Mar 2021 15:50:29 -0600 Subject: [PATCH 016/317] fixed typo --- flow/ActorLineageSet.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flow/ActorLineageSet.cpp b/flow/ActorLineageSet.cpp index 0957339501..9a0d34c9bf 100644 --- a/flow/ActorLineageSet.cpp +++ b/flow/ActorLineageSet.cpp @@ -81,7 +81,7 @@ std::vector> ActorLineageSet::copy() { // after we're done we need to clean up all objects that contented on a lock. 
This won't be perfect (as some thread // might not yet added the object to the free list), but whatever we don't get now we'll clean up in the next // iteration - freeList.consume_all([](auto toClean) { toClean->delRef(); }); + freeList.consume_all([](auto toClean) { toClean->delref(); }); return result; } From 4f1b807e1f480f24a0e3cb9622149953c295a4ab Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Wed, 17 Mar 2021 16:01:23 -0600 Subject: [PATCH 017/317] assert object alignment --- flow/ActorLineageSet.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/flow/ActorLineageSet.cpp b/flow/ActorLineageSet.cpp index 9a0d34c9bf..570976379c 100644 --- a/flow/ActorLineageSet.cpp +++ b/flow/ActorLineageSet.cpp @@ -93,6 +93,7 @@ ActorLineageSet::Index ActorLineageSet::insert(const Reference& li } ASSERT(_set[res].load() & FREE); auto ptr = reinterpret_cast(lineage.getPtr()); + ASSERT((ptr % 4) == 0); // this needs to be at least 4-byte aligned lineage->addref(); _set[res].store(ptr); return res; From 650e0de62570338ebff06cedc819a9bb00a0b925 Mon Sep 17 00:00:00 2001 From: Jon Fu Date: Thu, 18 Mar 2021 15:32:17 -0400 Subject: [PATCH 018/317] Remove extra downgrade workloads to restrict downgrade testing to 1 version apart --- tests/CMakeLists.txt | 3 -- .../to_6.2.33/CycleTestRestart-1.txt | 30 ------------------- .../to_6.2.33/CycleTestRestart-2.txt | 26 ---------------- 3 files changed, 59 deletions(-) delete mode 100644 tests/restarting/to_6.2.33/CycleTestRestart-1.txt delete mode 100644 tests/restarting/to_6.2.33/CycleTestRestart-2.txt diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 16f0eb2170..132616b1bb 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -204,9 +204,6 @@ if(WITH_PYTHON) add_fdb_test( TEST_FILES restarting/from_7.0.0/UpgradeAndBackupRestore-1.toml restarting/from_7.0.0/UpgradeAndBackupRestore-2.toml) - add_fdb_test( - TEST_FILES restarting/to_6.2.33/CycleTestRestart-1.txt - restarting/to_6.2.33/CycleTestRestart-2.txt IGNORE) 
add_fdb_test( TEST_FILES restarting/to_6.3.10/CycleTestRestart-1.txt restarting/to_6.3.10/CycleTestRestart-2.txt) diff --git a/tests/restarting/to_6.2.33/CycleTestRestart-1.txt b/tests/restarting/to_6.2.33/CycleTestRestart-1.txt deleted file mode 100644 index 647c2f3fe3..0000000000 --- a/tests/restarting/to_6.2.33/CycleTestRestart-1.txt +++ /dev/null @@ -1,30 +0,0 @@ -testTitle=Clogged - clearAfterTest=false - testName=Cycle - transactionsPerSecond=500.0 - nodeCount=2500 - testDuration=10.0 - expectedRate=0 - - testName=RandomClogging - testDuration=10.0 - - testName=Rollback - meanDelay=10.0 - testDuration=10.0 - - testName=Attrition - machinesToKill=10 - machinesToLeave=3 - reboot=true - testDuration=10.0 - - testName=Attrition - machinesToKill=10 - machinesToLeave=3 - reboot=true - testDuration=10.0 - - testName=SaveAndKill - restartInfoLocation=simfdb/restartInfo.ini - testDuration=10.0 diff --git a/tests/restarting/to_6.2.33/CycleTestRestart-2.txt b/tests/restarting/to_6.2.33/CycleTestRestart-2.txt deleted file mode 100644 index 7d498f2be1..0000000000 --- a/tests/restarting/to_6.2.33/CycleTestRestart-2.txt +++ /dev/null @@ -1,26 +0,0 @@ -testTitle=Clogged - runSetup=false - testName=Cycle - transactionsPerSecond=2500.0 - nodeCount=2500 - testDuration=10.0 - expectedRate=0 - - testName=RandomClogging - testDuration=10.0 - - testName=Rollback - meanDelay=10.0 - testDuration=10.0 - - testName=Attrition - machinesToKill=10 - machinesToLeave=3 - reboot=true - testDuration=10.0 - - testName=Attrition - machinesToKill=10 - machinesToLeave=3 - reboot=true - testDuration=10.0 From 7080ea1f1f1b281070ecf8f5ab9caa5c7365355b Mon Sep 17 00:00:00 2001 From: Xiaoge Su Date: Tue, 16 Mar 2021 05:05:03 -0700 Subject: [PATCH 019/317] Add document describes how a get/commit is done in FDB --- design/Commit/Commit.svg | 1 + design/Commit/CommitOverall.svg | 1 + design/Commit/GRV.svg | 1 + design/Commit/Get.svg | 1 + design/Commit/GetRange.svg | 1 + 
design/Commit/GetRangeFallback.svg | 1 + design/Commit/How a commit is done in FDB.md | 204 +++++++++++++++++++ design/Commit/commit.sequence | 148 ++++++++++++++ design/Commit/commitoverall.sequence | 54 +++++ design/Commit/get.sequence | 68 +++++++ design/Commit/getrange.sequence | 60 ++++++ design/Commit/getrangefallback.sequence | 80 ++++++++ design/Commit/grv.sequence | 66 ++++++ 13 files changed, 686 insertions(+) create mode 100644 design/Commit/Commit.svg create mode 100644 design/Commit/CommitOverall.svg create mode 100644 design/Commit/GRV.svg create mode 100644 design/Commit/Get.svg create mode 100644 design/Commit/GetRange.svg create mode 100644 design/Commit/GetRangeFallback.svg create mode 100644 design/Commit/How a commit is done in FDB.md create mode 100644 design/Commit/commit.sequence create mode 100644 design/Commit/commitoverall.sequence create mode 100644 design/Commit/get.sequence create mode 100644 design/Commit/getrange.sequence create mode 100644 design/Commit/getrangefallback.sequence create mode 100644 design/Commit/grv.sequence diff --git a/design/Commit/Commit.svg b/design/Commit/Commit.svg new file mode 100644 index 0000000000..6a59a6c0bd --- /dev/null +++ b/design/Commit/Commit.svg @@ -0,0 +1 @@ +CommitClient (NativeAPI.actor.cpp)CommitProxy (CommitProxyServer.actor.cpp)Master Resolver (Resolver.actor.cpp)TLog (TLogServer.actor.cpp)Storage Server (storageserver.actor.cpp)Transaction::commitcommitAndWatchtryCommitwatchValuecommitBatchercommitBatchTagPartitionedLogSystemgetVersionserveLiveCommittedVersionresolveBatchtLogCommitserveWatchValueRequestsCommitTransactionRequestCommitAttachIDNativeAPI.commit.BeforeCommitTransactionRequestBatch commit requestsBatched CommitTransactionRequestsCommitProxyServer.batcherPreresolutionGettingCommitVersionGetCommitVersionRequestGetCommitVersionReplyGotCommitVersionResolveResolveTransactionBatchRequestBeforeWait for memory/needed versionAfterQueueSizeCheckWait for resolver versionAfterOrdererResolve 
the conflictsAfterResolveTransactionBatchReplyProcessingMutationsCalculate the metadataDetermine which transactions should be committedAssign storage server tags to mutationsGetRawCommittedVersionRequestGetRawCommittedVersionReplyAfterStoreCommitsVersion, LogPushDataTLogCommitRequestCommitAttachIDBeforeWaitForVersionWait for the versionBeforeStore the commitPut commit into persistent queueAfterTLogCommitWait all prior message being committedAfterTLogCommitReplyVersion (min)AfterLogPushCommitIDNativeAPI.commit.AfterVersionWatchValueAttachIDBeforeWatchValueRequestwatchValueQ.BeforeEnsure version is not too oldwatchValueQ.AfterVersionCheck storageserver::getValueQwatchValueQ.AfterReadVersionAfterloop[Batch requests]loop[Wait txn commit version enter the MVCC window]loop[Value not change] \ No newline at end of file diff --git a/design/Commit/CommitOverall.svg b/design/Commit/CommitOverall.svg new file mode 100644 index 0000000000..a96b08c205 --- /dev/null +++ b/design/Commit/CommitOverall.svg @@ -0,0 +1 @@ +Commit in FoundationDBClientGetReadVersionProxyCommitProxyMasterResolverTLogRequest read versionRequest committed versionRespond committed versionRespond read versionCommit a mutation with read versionPre-resolutionRequest a commit versionCommit versionCommit versionNeverResolutionSend the transaction to the resolverTransactionCommittedTransactionConflictTransactionTooOldPost-resolutionPush the transaction data to TLogThe version of the transactions that are already durabletlog_stoppedReplyReport raw commit versionVoidCommit versionNot committed: conflictNot committed: too oldalt[New request][Replied before with a commit version][Replied before without commit version]alt[No conflict][Conflict][Read snapshot older than oldest version]alt[TLog not stopped][TLog stopped]alt[Commit successful][Conflict][Transaction too old] \ No newline at end of file diff --git a/design/Commit/GRV.svg b/design/Commit/GRV.svg new file mode 100644 index 0000000000..ab2451fa03 --- 
/dev/null +++ b/design/Commit/GRV.svg @@ -0,0 +1 @@ +Get Read VersionClient (NativeAPI.actor.cpp)GRVProxy (GrvProxyServer.actor.cpp)Master (masterserver.actor.cpp)Transaction::getReadVersionreadVersionBatchergetConsistentReadVersionqueueGetReadVersionRequeststransactionStartergetLiveCommittedVersionserveLiveCommittedVersionVersionRequestBatch read version requestsTransactionAttachIDBeforeGetReadVersionRequestBatch read version requestsGrvProxyServer.queueTransactionStartRequests.BeforeTransactionAttachIDAskLiveCommittedVersionFromMasterconfirmEpochLiveGetRawCommittedVersionRequestGetRawCommittedVersionGetRawCommittedVersionReplyAfterGetReadVersionReplyAfterGetReadVersionReplyGetReadVersionReplyloop[Batch requests]loop[Batch requests] \ No newline at end of file diff --git a/design/Commit/Get.svg b/design/Commit/Get.svg new file mode 100644 index 0000000000..ab2451fa03 --- /dev/null +++ b/design/Commit/Get.svg @@ -0,0 +1 @@ +Get Read VersionClient (NativeAPI.actor.cpp)GRVProxy (GrvProxyServer.actor.cpp)Master (masterserver.actor.cpp)Transaction::getReadVersionreadVersionBatchergetConsistentReadVersionqueueGetReadVersionRequeststransactionStartergetLiveCommittedVersionserveLiveCommittedVersionVersionRequestBatch read version requestsTransactionAttachIDBeforeGetReadVersionRequestBatch read version requestsGrvProxyServer.queueTransactionStartRequests.BeforeTransactionAttachIDAskLiveCommittedVersionFromMasterconfirmEpochLiveGetRawCommittedVersionRequestGetRawCommittedVersionGetRawCommittedVersionReplyAfterGetReadVersionReplyAfterGetReadVersionReplyGetReadVersionReplyloop[Batch requests]loop[Batch requests] \ No newline at end of file diff --git a/design/Commit/GetRange.svg b/design/Commit/GetRange.svg new file mode 100644 index 0000000000..9aa3ac4d13 --- /dev/null +++ b/design/Commit/GetRange.svg @@ -0,0 +1 @@ +GetRangeClient (NativeAPI.actor.cpp)Storage Server 
(storageserver.actor.cpp)Transaction::getRangeTransaction::getReadVersiongetRangegetKeyLocationgetKeyValuesQKeyRangeVersionKeyConsult Get sectionLocationInfoBeforeGetKeyValuesRequeststorageserver.getKeyValues.BeforeWait the SS versionstorageserver.getKeyValues.AfterVersionRealign the keysstorageserver.getKeyValues.AfterKeysstorageserver.getKeyValues.SendGetKeyValuesReply (empty)storageserver.getKeyValues.AfterReadRangeGetKeyValuesReplyAfterCombines the resultsErrorFallbackRangeResultRef or ErrorRangeResultRefloop[Keys in the range]alt[No KV pair stored in this server][KV pair found]alt[Error][Successful] \ No newline at end of file diff --git a/design/Commit/GetRangeFallback.svg b/design/Commit/GetRangeFallback.svg new file mode 100644 index 0000000000..dcb2ea84f3 --- /dev/null +++ b/design/Commit/GetRangeFallback.svg @@ -0,0 +1 @@ +GetRange FallbackClient (NativeAPI.actor.cpp)Storage Server (storageserver.actor.cpp)getRangeFallbackgetKeygetExactRangegetKeyRangeLocationsserveGetKeyValuesRequestsserveGetKeyRequestsKeySelectorWait for the versionGetKeyAttachIDAfterVersionSee getKeyLocation in GetBeforeGetKeyRequestGetKeyReplyAfterKeyErrorUpdate read version if necessaryVersion, KeyRangeRefKeyRangeBeforeGet the locationsAfterLocationInfoBeforeGetKeyValuesRequeststorageserver.getKeyValues.BeforeWait the SS versionstorageserver.getKeyValues.AfterVersionRealign the keysstorageserver.getKeyValues.AfterKeysstorageserver.getKeyValues.SendGetKeyValuesReply (empty)storageserver.getKeyValues.AfterReadRangeGetKeyValuesReplyAfterRangeResultRefopt[Key need resolve]alt[Success][Error]loop[Loop over keys in the range]loop[Loop over shards]alt[No KV pair stored in this server][KV pair found] \ No newline at end of file diff --git a/design/Commit/How a commit is done in FDB.md b/design/Commit/How a commit is done in FDB.md new file mode 100644 index 0000000000..78d74ed1e2 --- /dev/null +++ b/design/Commit/How a commit is done in FDB.md @@ -0,0 +1,204 @@ +# How a commit is done in FDB 
+ +## Overall description + +Legend: + +* `alt` means alternative paths + * The texts in `[]` are conditions + * The texts above the arrow are messages. + +The diagrams are generated using https://sequencediagram.org. The source code of the diagrams is in the `*.sequence` files. + +![CommitOverall](CommitOverall.svg) + + + +## Description of each section + +Before all RPCs mentioned below, the client would first verify if the commit proxies and GRV proxies are changed, by comparing the client information ID it holds to the ID the cluster coordinator holds. If they are different, the proxies are changed and the client will refresh the proxies list. + +### GetReadVersion Section + +* The GRV Proxy sends a request to master to retrieve the current commit version. This version is the read version of the request. + +### Preresolution Section + +* The commit proxy sends a request for commit version, with a request number. + +* - The request number is a monotonically increasing number per commit proxy. + - This ensures that, for each proxy, the master will process the requests in order. + +* The master server waits until the request number is current. + + When the current request number is larger than the incoming request number + + * If a commit version is already assigned to the incoming request number, return the commit version and the version that is immediately before the commit version (prevVersion). + + * Otherwise return `Never` + + * Increase current commit version, return it back to the commit proxy. + + * Only one process serves as master. Thus the commit version is unique for each cluster. + + * The monotonically increasing commit version will ensure each transaction is processed in strict order. + +### Resolution section + +* The commit proxy sends the transaction to the resolver. +* Resolver waits until its version reaches `prevVersion` + * Ensures all transactions having version smaller than this transaction are resolved. 
+ * Detects conflicts for the given transaction: + * If there is no conflict, return `TransactionCommitted` as the status + * Any conflict, return `TransactionConflict` status + * If the read snapshot is not in MVCC, return `TransactionTooOld` status + +### Post Resolution section + +* The proxy waits until the local batch number is current +* The proxy will update the metadata keys and calculate which storage servers are affected +* The proxy then waits until the commit version is current, i.e. only those commits in the MVCC window should be processed. +* The proxy pushes the commit data to TLog +* TLog waits until the commit version is current, then persists the commit. + +### TLog section + +* Wait until *all* TLogs return the transaction result. + +### Reply section + +* The proxy will update the master with its commit version +* Reply the result to the client, based on the result from the resolver. + +## Tracking the process using `g_traceBatch` + +`g_traceBatch` can be used for querying the transactions and commits. A typical query string: + +``` +index=iffdb LogGroup=loggroup Type=location Location=location +``` + +The format of `location` is, in general, `<file>.<function>.<location>`, e.g. + +``` +NativeAPI.getConsistentReadVersion.Before +``` + +means the `location` is at `NativeAPI.actor.cpp`, `ACTOR` `getConsistentReadVersion`, `Before` requesting the read version from GRV Proxy. + +In the following sections, green tag indicates an attach; blue tag indicates an event that the location follows the format mentioned above, where only the `<location>` is included; light-blue tag indicates an event that the location is not following the format, where the full location is included. All the `g_traceBatch` events are tabularized after the diagram. + +`contrib/commit_debug.py` can be used to visualize the commit process. 
+ +### Get Read Version + +![GetReadVersion](GRV.svg) + +| **Role** | **File name** | **Function/Actor** | **Trace** | **Type** | **Location** | +| ------------ | -------------- | --------------------------- | --------------------------------- | ------------------------------------------------------------ | ------------------------------------------------------------ | +| **Client** | NativeAPI | Transaction::getReadVersion | | | | +| | | readVersionBatcher | | [*TransactionAttachID*](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L4639) | | +| | | getConsistentReadVersion | Before | TransactionDebug | [NativeAPI.getConsistentReadVersion.Before](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L4564) | +| **GRVProxy** | GrvProxyServer | queueGetReadVersionRequests | Before | TransactionDebug | [GrvProxyServer.queueTransactionStartRequests.Before](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/GrvProxyServer.actor.cpp#L373-L375) | +| | | transactionStarter | | [*TransactionAttachID*](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/GrvProxyServer.actor.cpp#L734-L735) | | +| | | | AskLiveCommittedVersionFromMaster | TransactionDebug | [GrvProxyServer.transactionStarter.AskLiveCommittedVersionFromMaster](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/GrvProxyServer.actor.cpp#L787-L789) | +| | | getLiveCommittedVersion | confirmEpochLive | TransactionDebug | [GrvProxyServer.getLiveCommittedVersion.confirmEpochLive](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/GrvProxyServer.actor.cpp#L479-L480) | +| **Master** | MasterServer | serveLiveCommittedVersion | GetRawCommittedVersion | TransactionDebug | 
[MasterServer.serveLiveCommittedVersion.GetRawCommittedVersion](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/masterserver.actor.cpp#L1187-L1189) | +| **GRVProxy** | GrvProxyServer | getLiveCommittedVersion | After | TransactionDebug | [GrvProxyServer.getLiveCommittedVersion.After](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/GrvProxyServer.actor.cpp#L500-L501) | +| **Client** | NativeAPI | getConsistentReadVersion | After | TransactionDebug | [NativeAPI.getConsistentReadVersion.After](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L4594-L4595) | + +### Get + +![Get](Get.svg) + +| **Role** | **File name** | **Function/Actor** | **Trace** | **Name** | **Location** | **Notes** | +| ------------------ | ------------------- | ----------------------------------- | ------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | +| **Client** | NativeAPI | Transaction::get | | | | | +| | | Transaction::getReadVersion | | | *(Refer to GetReadVersion)* | | +| | | getKeyLocation | Before | TransactionDebug | [NativeAPI.getKeyLocation.Before](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L1975-L1976) | getKeyLocation is called by getValue, getKeyLocation actually calls getKeyLocation_internal | +| | | | After | TransactionDebug | [NativeAPI.getKeyLocation.After](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L1988-L1989) | | +| | | getValue | | [*GetValueAttachID*](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L2164) | | | +| | | | Before | GetValueDebug | 
[NativeAPI.getValue.Before](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L2165-L2167) | | +| **Storage Server** | StorageServer | serveGetValueRequests | received | GetValueDebug | [StorageServer.received](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/storageserver.actor.cpp#L4325-L4327) | | +| | | getValueQ | DoRead | GetValueDebug | [getValueQ.DoRead](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/storageserver.actor.cpp#L1115-L1117) | | +| | | | AfterVersion | GetValueDebug | [getValueQ.AfterVersion](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/storageserver.actor.cpp#L1122-L1124) | | +| | KeyValueStoreSQLite | KeyValueStoreSQLite::Reader::action | Before | GetValueDebug | [Reader.Before](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/KeyValueStoreSQLite.actor.cpp#L1654-L1656) | | +| | | | After | GetValueDebug | [Reader.After](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/KeyValueStoreSQLite.actor.cpp#L1662-L1664) | | +| | StorageServer | | AfterRead | GetValueDebug | [getValueQ.AfterRead](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/storageserver.actor.cpp#L1185-L1187) | | +| **Client** | NativeAPI | getValue | After | GetValueDebug | [NativeAPI.getValue.After](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L2216-L2218) | (When successful) | +| | | | Error | GetValueDebug | [NativeAPI.getValue.Error](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L2232-L2234) | (Wehn failure) | + + + +### Get Range + +![GetRange](GetRange.svg) + +| **Role** | **File name** | **Function/Actor** | 
**Trace** | **Name** | **Location** | **Notes** | +| ------------------ | ------------- | --------------------------- | -------------- | ---------------- | ------------------------------------------------------------ | ------------------------------------ | +| **Client** | NativeAPI | Transaction::getRange | | | | | +| | | Transaction::getReadVersion | | | *(Refer to GetReadVersion)* | | +| | | getKeyLocation | Before | TransactionDebug | [NativeAPI.getKeyLocation.Before](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L1975) | getKeyLocation is called by getRange | +| | | | After | TransactionDebug | [NativeAPI.getKeyLocation.After](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L1988-L1989) | | +| | | getRange | Before | TransactionDebug | [NativeAPI.getRange.Before](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L3004) | | +| **Storage Server** | storageserver | getKeyValuesQ | Before | TransactionDebug | [storageserver.getKeyValues.Before](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/storageserver.actor.cpp#L1812) | | +| | | | AfterVersion | TransactionDebug | [storageserver.getKeyValues.AfterVersion](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/storageserver.actor.cpp#L1821) | | +| | | | AfterKeys | TransactionDebug | [storageserver.getKeyValues.AfterKeys](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/storageserver.actor.cpp#L1846) | | +| | | | Send | TransactionDebug | [storageserver.getKeyValues.Send](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/storageserver.actor.cpp#L1866) | (When no keys found) | +| | | | AfterReadRange | TransactionDebug | 
[storageserver.getKeyValues.AfterReadRange](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/storageserver.actor.cpp#L1886) | (When found keys in this SS) | +| **Client** | NativeAPI | getRange | After | TransactionDebug | [NativeAPI.getRange.After](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L3044-L3046) | (When successful) | +| | | | Error | TransactionDebug | [NativeAPI.getRange.Error](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L3155-L3156) | (Wehn failure) | + +### GetRange Fallback + +![GetRangeFallback](GetRangeFallback.svg) + +| **Role** | **File name** | **Function/Actor** | **Trace** | **Type** | **Location** | **Notes** | +| ---------- | ------------- | -------------------- | ------------ | ---------------- | ------------------------------------------------------------ | ----------------------------------------------- | +| **Client** | NativeAPI | getRangeFallback | | | | | +| | | getKey | | | *GetKeyAttachID* | | +| | | | AfterVersion | GetKeyDebug | [NativeAPI.getKey.AfterVersion](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L2263-L2266) | | +| | | | Before | GetKeyDebug | [NativeAPI.getKey.Before](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L2285-L2288) | | +| | | | After | GetKeyDebug | [NativeAPI.getKey.After](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L2316-L2318) | Success | +| | | | Error | GetKeyDebug | [NativeAPI.getKey.Error](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L2326) | Error | +| | | getReadVersion | | | | *(Refer to GetReadVersion)* | +| | | getKeyRangeLocations | Before | 
TransactionDebug | [NativeAPI.getKeyLocations.Before](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L2029) | | +| | | | After | TransactionDebug | [NativeAPI.getKeyLocations.After](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L2044) | | +| | | getExactRange | Before | TransactionDebug | [NativeAPI.getExactRange.Before](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L2674) | getKeyRangeLocations is called by getExactRange | +| | | | After | TransactionDebug | [NativeAPI.getExactRange.After](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L2707) | | + +### Commit + +![Commit](Commit.svg) + +| **Role** | **File name** | **Function/Actor** | **Trace** | **Type** | **Location** | **Notes** | +| ---------------- | ----------------- | ------------------------------------------- | -------------------- | ------------------------------------------------------------ | ------------------------------------------------------------ | --------- | +| **Client** | NativeAPI | Transaction::commit | | | | | +| | | commitAndWatch | | | | | +| | | tryCommit | | *[commitAttachID](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L4100)* | | | +| | | | Before | CommitDebug | [NativeAPI.commit.Before](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L4101-L4102) | | +| **Commit Proxy** | CommitProxyServer | commitBatcher | batcher | CommitDebug | [CommitProxyServer.batcher](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/CommitProxyServer.actor.cpp#L244-L245) | | +| | | commitBatch | | | | | +| | | CommitBatchContext::setupTraceBatch | | 
*[CommitAttachID](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/CommitProxyServer.actor.cpp#L526)* | | | +| | | | Before | CommitDebug | [CommitProxyServer.commitBatch.Before](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/CommitProxyServer.actor.cpp#L532) | | +| | | CommitBatchContext::preresolutionProcessing | GettingCommitVersion | CommitDebug | [CommitProxyServer.commitBatch.GettingCommitVersion](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/CommitProxyServer.actor.cpp#L616-L617) | | +| | | | GotCommitVersion | CommitDebug | [CommitProxyServer.commitBatch.GotCommitVersion](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/CommitProxyServer.actor.cpp#L643) | | +| **Resolver** | Resolver | resolveBatch | | *[CommitAttachID](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/Resolver.actor.cpp#L116)* | | | +| | | | Before | CommitDebug | [Resolver.resolveBatch.Before](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/Resolver.actor.cpp#L117) | | +| | | | AfterQueueSizeCheck | CommitDebug | [Resolver.resolveBatch.AfterQueueSizeCheck](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/Resolver.actor.cpp#L137) | | +| | | | AfterOrderer | CommitDebug | [Resolver.resolveBatch.AfterOrderer](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/Resolver.actor.cpp#L172) | | +| | | | After | CommitDebug | [Resolver.resolveBatch.After](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/Resolver.actor.cpp#L296) | | +| **Commit Proxy** | CommitProxyServer | CommitBatchContext::postResolution | ProcessingMutations | CommitDebug | 
[CommitProxyServer.CommitBatch.ProcessingMutations](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/CommitProxyServer.actor.cpp#L1074) | | +| | | | AfterStoreCommits | CommitDebug | [CommitProxyServer.CommitBatch.AfterStoreCommits](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/CommitProxyServer.actor.cpp#L1154) | | +| **TLog** | TLogServer | tLogCommit | | *[commitAttachID](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/TLogServer.actor.cpp#L2047)* | | | +| | | | BeforeWaitForVersion | CommitDebug | [TLogServer.tLogCommit.BeforeWaitForVersion](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/TLogServer.actor.cpp#L2048) | | +| | | | Before | CommitDebug | [TLog.tLogCommit.Before](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/TLogServer.actor.cpp#L2083) | | +| | | | AfterTLogCommit | CommitDebug | [TLog.tLogCommit.AfterTLogCommit](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/TLogServer.actor.cpp#L2107) | | +| | | | After | CommitDebug | [TLog.tLogCommit.After](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/TLogServer.actor.cpp#L2125) | | +| **Commit Proxy** | CommitProxyServer | CommitBatchContext::reply | AfterLogPush | CommitDebug | [CommitProxyServer.CommitBatch.AfterLogPush](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbserver/CommitProxyServer.actor.cpp#L1263) | | +| **Client** | NativeAPI | tryCommit | After | CommitDebug | [NativeAPI.commit.After](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L4152) | | +| | | commitAndWatch | | | | | +| | | watchValue | | 
*[WatchValueAttachID](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L2408)* | | | +| | | | Before | WatchValueDebug | [NativeAPI.watchValue.Before]() | | +| | | | After | WatchValueDebug | [NativeAPI.watchValue.After](https://github.com/apple/foundationdb/blob/ffb8e27f4325db3dc8465e145bc308f6854500eb/fdbclient/NativeAPI.actor.cpp#L2431-L2433) | | + diff --git a/design/Commit/commit.sequence b/design/Commit/commit.sequence new file mode 100644 index 0000000000..502a572c22 --- /dev/null +++ b/design/Commit/commit.sequence @@ -0,0 +1,148 @@ +title Commit + +participantgroup **Client** (NativeAPI.actor.cpp) + participant "Transaction::commit" as tC + participant "commitAndWatch" as cAW + participant "tryCommit" as Commit + participant "watchValue" as wV +end + +participantgroup **CommitProxy** (CommitProxyServer.actor.cpp) + participant "commitBatcher" as cB + participant "commitBatch" as Batch + participant "TagPartitionedLogSystem" as TPLS +end + +participantgroup **Master** + participant "getVersion" as gV + participant "serveLiveCommittedVersion" as sLCV +end + +participantgroup **Resolver** (Resolver.actor.cpp) + participant "resolveBatch" as rB +end + +participantgroup **TLog** (TLogServer.actor.cpp) + participant "tLogCommit" as tLC +end + +participantgroup **Storage Server** (storageserver.actor.cpp) + participant "serveWatchValueRequests" as sWVR +end + +autoactivation off + +tC -> cAW: +cAW -> Commit: CommitTransactionRequest + +note right of Commit: //CommitAttachID// +note right of Commit: NativeAPI.commit.Before + +Commit -> cB: CommitTransactionRequest +loop Batch requests + box over cB: Batch commit requests +end + +cB -> Batch: Batched CommitTransactionRequests + +note right of Batch: --CommitProxyServer.batcher-- + +box over Batch: Preresolution + +note right of Batch: GettingCommitVersion + +Batch -> gV: GetCommitVersionRequest +gV -> Batch: GetCommitVersionReply + +note right of 
Batch: GotCommitVersion + +box over Batch: Resolve + +Batch -> rB: ResolveTransactionBatchRequest + +note right of rB: Before + +box over rB: Wait for memory/needed version + +note right of rB: AfterQueueSizeCheck + +box over rB: Wait for resolver version + +note right of rB: AfterOrderer + +box over rB: Resolve the conflicts + +note right of rB: After + +rB --> Batch: ResolveTransactionBatchReply + +note right of Batch: ProcessingMutations + +box over Batch: Calculate the metadata + +box over Batch: Determine which transactions should be committed + +box over Batch: Assign storage server tags to mutations + +loop Wait txn commit version enter the MVCC window + Batch -> sLCV: GetRawCommittedVersionRequest + sLCV --> Batch: GetRawCommittedVersionReply +end + +note right of Batch: AfterStoreCommits + +Batch -> TPLS: Version, LogPushData +TPLS -> tLC: TLogCommitRequest + +note right of tLC: //CommitAttachID// + +note right of tLC: BeforeWaitForVersion + +box over tLC: Wait for the version + +note right of tLC: Before + +box over tLC: Store the commit + +box over tLC: Put commit into persistent queue + +note right of tLC: AfterTLogCommit + +box over tLC: Wait all prior message being committed + +note right of tLC: After + +tLC --> TPLS: TLogCommitReply +TPLS -> Batch: Version (min) + +note right of Batch: AfterLogPush + +Batch --> Commit: CommitID + +note right of Commit: --NativeAPI.commit.After-- + +Commit --> cAW: + +cAW -> wV: Version + +note right of wV: //WatchValueAttachID// +note right of wV: Before + +wV -> sWVR: WatchValueRequest + +note right of sWVR: --watchValueQ.Before-- + +box over sWVR: Ensure version is not too old + +note right of sWVR: --watchValueQ.AfterVersion-- + +loop Value not change + box over sWVR: Check storageserver::getValueQ + note right of sWVR: --watchValueQ.AfterRead-- +end + +sWVR --> wV: Version + +note right of wV: After + +cAW --> tC: diff --git a/design/Commit/commitoverall.sequence b/design/Commit/commitoverall.sequence new file 
mode 100644 index 0000000000..4bb250b219 --- /dev/null +++ b/design/Commit/commitoverall.sequence @@ -0,0 +1,54 @@ +title Commit in FoundationDB + +participant "Client" as C +participant "GetReadVersionProxy" as GRV +participant "CommitProxy" as P +participant "Master" as M +participant "Resolver" as R +participant "TLog" as T + +C ->> GRV: Request read version +GRV ->> M: Request committed version +M ->> GRV: Respond committed version +GRV ->> C: Respond read version + +C ->> P: Commit a mutation with read version + +box right of P: Pre-resolution +P ->> M: Request a commit version +alt New request + M ->> P: Commit version +else Replied before with a commit version + M ->> P: Commit version +else Replied before without commit version + M --x P: Never +end + +box right of P: Resolution +P ->> R: Send the transaction to the resolver +alt No conflict + R ->> P: TransactionCommitted +else Conflict + R ->> P: TransactionConflict +else Read snapshot older than oldest version + R ->> P: TransactionTooOld +end + +box right of P: Post-resolution +P ->> T: Push the transaction data to TLog +alt TLog not stopped + T ->> P: The version of the transactions that are already durable +else TLog stopped + T ->> P: tlog_stopped +end + +box right of P: Reply +P ->> M: Report raw commit version +M -->> P: Void +alt Commit successful + P ->> C: Commit version +else Conflict + P ->> C: Not committed: conflict +else Transaction too old + P ->> C: Not committed: too old +end diff --git a/design/Commit/get.sequence b/design/Commit/get.sequence new file mode 100644 index 0000000000..dcd2ee7073 --- /dev/null +++ b/design/Commit/get.sequence @@ -0,0 +1,68 @@ +title Get + +participantgroup **Client** (NativeAPI.actor.cpp) + participant "Transaction::get" as get + participant "Transaction::getReadVersion" as gRV + participant "getValue" as gV + participant "getKeyLocation" as gKL +end + +participantgroup **CommitProxy** (CommitProxyServer.actor.cpp) + participant "doKeyServerLocationRequest" 
as dKSLR +end + +participantgroup **Storage Server** (storageserver.actor.cpp) + participant "serveGetValueRequests" as sGVR + participant "getValueQ" as gVQ +end + +participantgroup **KeyValueStoreSQLite** (KeyValueStoreSQLite.actor.cpp) + participant "KeyValueStoreSQLite::Reader::action" as axn +end + +autoactivation off + +get -> gRV: +box over gRV: //Consult Get Read Version section// +gRV --> get: Version + +get -> gV: Version, Key +gV -> gKL: Key +note right of gKL: Before + +gKL -> dKSLR: GetKeyServerLocationsRequest +dKSLR --> gKL: GetKeyServerLocationsReply + +note right of gKL: After + +gKL --> gV: LocationInfo + +note right of gV: //GetValueAttachID// + +note right of gV: Before + +gV -> sGVR: GetValueRequest +note right of sGVR: --storageServer.received-- + +sGVR -> gVQ: GetValueRequest + +note right of gVQ: --getValueQ.DoRead-- + +note right of gVQ: --getValueQ.AfterVersion-- + +gVQ -> axn: Key + +note right of axn: --Reader.Before-- +note right of axn: --Reader.After-- + +axn --> gVQ: Value +note right of gVQ: --getValueQ.AfterRead-- + +gVQ --> gV: GetValueReply +alt Error + note right of gV: Error + gV --> get: Error +else Success + note right of gV: After + gV --> get: Value +end diff --git a/design/Commit/getrange.sequence b/design/Commit/getrange.sequence new file mode 100644 index 0000000000..5a07436b99 --- /dev/null +++ b/design/Commit/getrange.sequence @@ -0,0 +1,60 @@ +title GetRange + +participantgroup **Client** (NativeAPI.actor.cpp) + participant "Transaction::getRange" as tGR + participant "Transaction::getReadVersion" as gRV + participant "getRange" as gR + participant "getKeyLocation" as gKL +end + +participantgroup **Storage Server** (storageserver.actor.cpp) + participant "getKeyValuesQ" as gKVQ +end + +autoactivation off + +tGR -> gRV: +tGR -> gR: KeyRange +gRV -->(2) gR: Version + +loop Keys in the range + gR -> gKL: Key + + box over gKL: //Consult Get section// + + gKL --> gR: LocationInfo + + note right of gR: Before + + gR -> 
gKVQ: GetKeyValuesRequest + + note right of gKVQ: --storageserver.getKeyValues.Before-- + + box over gKVQ: Wait the SS version + + note right of gKVQ: --storageserver.getKeyValues.AfterVersion-- + + box over gKVQ: Realign the keys + + note right of gKVQ: --storageserver.getKeyValues.AfterKeys-- + + alt No KV pair stored in this server + note right of gKVQ: --storageserver.getKeyValues.Send-- + gKVQ --> gR: GetKeyValuesReply (empty) + else KV pair found + note right of gKVQ: --storageserver.getKeyValues.AfterReadRange-- + gKVQ --> gR: GetKeyValuesReply + end + + note right of gR: After + + box over gR: Combines the results +end + +alt Error + note right of gR: Error + box over gR: Fallback + gR -> tGR: RangeResultRef or Error +else Successful + gR -> tGR: RangeResultRef +end diff --git a/design/Commit/getrangefallback.sequence b/design/Commit/getrangefallback.sequence new file mode 100644 index 0000000000..7fdbf56a3c --- /dev/null +++ b/design/Commit/getrangefallback.sequence @@ -0,0 +1,80 @@ +title GetRange Fallback + +participantgroup **Client** (NativeAPI.actor.cpp) + participant "getRangeFallback" as gRF + participant "getKey" as gK + participant "getExactRange" as gER + participant "getKeyRangeLocations" as gKRL +end + +participantgroup **Storage Server** (storageserver.actor.cpp) + participant "serveGetKeyValuesRequests" as sGKVR + participant "serveGetKeyRequests" as sGKR +end + +autoactivation off + +opt Key need resolve + gRF -> gK: KeySelector + + box over gK: Wait for the version + + note right of gK: //GetKeyAttachID// + note right of gK: AfterVersion + + box over gK: See getKeyLocation in Get + + note right of gK: Before + + gK -> sGKR: GetKeyRequest + sGKR --> gK: GetKeyReply + + alt Success + note right of gK: After + gK --> gRF: Key + else Error + note right of gK: Error + end +end + +box over gRF: Update read version if necessary + +gRF -> gER: Version, KeyRangeRef + +loop Loop over keys in the range + gER -> gKRL: KeyRange + + note right of gKRL: 
Before + box over gKRL: Get the locations + note right of gKRL: After + + gKRL --> gER: LocationInfo + + loop Loop over shards + note right of gER: Before + + gER -> sGKVR: GetKeyValuesRequest + + note right of sGKVR: --storageserver.getKeyValues.Before-- + + box over sGKVR: Wait the SS version + + note right of sGKVR: --storageserver.getKeyValues.AfterVersion-- + + box over sGKVR: Realign the keys + + note right of sGKVR: --storageserver.getKeyValues.AfterKeys-- + + alt No KV pair stored in this server + note right of sGKVR: --storageserver.getKeyValues.Send-- + sGKVR --> gER: GetKeyValuesReply (empty) + else KV pair found + note right of sGKVR: --storageserver.getKeyValues.AfterReadRange-- + sGKVR --> gER: GetKeyValuesReply + end + + note right of gER: After + end +end + +gER --> gRF: RangeResultRef diff --git a/design/Commit/grv.sequence b/design/Commit/grv.sequence new file mode 100644 index 0000000000..c09ac97830 --- /dev/null +++ b/design/Commit/grv.sequence @@ -0,0 +1,66 @@ +title Get Read Version + +participantgroup **Client** (NativeAPI.actor.cpp) + participant "Transaction::getReadVersion" as gRV + participant "readVersionBatcher" as rVB + participant "getConsistentReadVersion" as gCRV +end + +participantgroup **GRVProxy** (GrvProxyServer.actor.cpp) + participant "queueGetReadVersionRequests" as qGRVR + participant "transactionStarter" as tS + participant "getLiveCommittedVersion" as gLCV +end + +participantgroup **Master** (masterserver.actor.cpp) + participant "serveLiveCommittedVersion" as sLCV +end + +autoactivation off + +gRV -> rVB: VersionRequest + +loop Batch requests + box over rVB:Batch read version requests +end + +note right of rVB: //TransactionAttachID// + +rVB -> gCRV: + +note right of gCRV: Before + +gCRV -> qGRVR: GetReadVersionRequest + +loop Batch requests + box over qGRVR: Batch read version requests +end + +note right of qGRVR: --GrvProxyServer.queueTransactionStartRequests.Before-- + +qGRVR -> tS: + +note right of tS: 
//TransactionAttachID// + +note right of tS: AskLiveCommittedVersionFromMaster + +tS -> gLCV: + +note right of gLCV: confirmEpochLive + +gLCV -> sLCV: GetRawCommittedVersionRequest + +note right of sLCV: GetRawCommittedVersion + +sLCV --> gLCV: GetRawCommittedVersionReply + +note right of gLCV: After + +gLCV --> gCRV: GetReadVersionReply + +note right of gCRV: After + +gCRV --> rVB: GetReadVersionReply + +rVB --> gRV: GetReadVersionReply + From 927c7993ccd1bdb27ca821a773e451bad11292ba Mon Sep 17 00:00:00 2001 From: Xiaoge Su Date: Wed, 17 Mar 2021 01:15:21 -0700 Subject: [PATCH 020/317] Update the documentation per comments --- design/Commit/How a commit is done in FDB.md | 32 +++++++++++++------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/design/Commit/How a commit is done in FDB.md b/design/Commit/How a commit is done in FDB.md index 78d74ed1e2..1b6c9e62e3 100644 --- a/design/Commit/How a commit is done in FDB.md +++ b/design/Commit/How a commit is done in FDB.md @@ -26,14 +26,14 @@ Before all RPCs mentioned below, the client would first verify if the commit pro * The commit proxy sends a request for commit version, with a request number. -* - The request number is a monotonically increasing number per commit proxy. + - The request number is a monotonically increasing number per commit proxy. - This ensures for each proxy, the master will process the requests in order. * The master server waits until the request number is current. When the current request number is larger than the incoming request number - * If a commit version is already assigned to the incoming request number, return the commit version and the version that is immediately before the commit version (prevVersion). + * If a commit version is already assigned to the incoming request number, return the commit version and the previous commit version. (i.e. 
`prevVersion`) * Otherwise return `Never` @@ -41,7 +41,7 @@ Before all RPCs mentioned below, the client would first verify if the commit pro * Only one process serves as master. Thus the commit version is unique for each cluster. - * The monotonically increasing commit version will ensure each transaction processed in strict ordering. + * The monotonically increasing commit version will ensure that each transaction is processed in a strict serial order. ### Resolution section @@ -56,26 +56,25 @@ Before all RPCs mentioned below, the client would first verify if the commit pro ### Post Resolution section * The proxy waits until the local batch number is current -* The proxy will update the metadata keys and calculate which storage servers are affected -* The proxy then waits until the commit version is current, i.e. only those commits in the MVCC window should be processed. -* The proxy pushs the commit data to TLog -* TLog waits the commit version to current, then persist the commit. ### TLog section - +* The proxy updates the metadata keys and attaches corresponding storage servers' tags to all mutations. +* The proxy then waits until the commit version is current, i.e. the proxy's committed version is catching up with the commit version of the batch and these two versions are within the MVCC window. +* The proxy pushs the commit data to TLog +* TLog waits the commit version to current, then persist the commit. * Wait until *all* TLogs returns the transaction result. ### Reply section -* The proxy will update the master its commit version +* The proxy updates the master with the committed version for next GRV request at the master. * Reply the result to the client, base on the result from the resolver. ## Tracking the process using `g_traceBatch` -`g_traceBatch` can be used for querying the transactions and commits. A typical query string: +`g_traceBatch` can be used for querying the transactions and commits. 
A typical query string for Splunk is: ``` -index=iffdb LogGroup=loggroup Type=location Location=location +LogGroup=loggroup Type=type Location=location ``` The format of `location` is, in general, `..`, e.g. @@ -86,6 +85,17 @@ NativeAPI.getConsistentReadVersion.Before means the `location` is at `NativeAPI.actor.cpp`, `ACTOR` `getConsistentReadVersion`, `Before` requesting the read version from GRV Proxy. +Some of example queries are: + +``` +LogGroup=loggroup Type=TransactionDebug Location=NativeAPI* +``` + +``` +LogGroup=loggroup Type=CommitDebug Location=storageserver* +``` + + In the following sections, green tag indicates an attach; blue tag indicates an event that the location follows the format mentioned above, where only the `` is included; light-blue tag indicates an event that the location is not following the format, where the full location is included. All the `g_traceBatch` events are tabularized after the diagram. `contrib/commit_debug.py` can be used to visualize the commit process. From 924253da86afa4dd87c14a911e8d68160d0733a3 Mon Sep 17 00:00:00 2001 From: Xiaoge Su Date: Thu, 18 Mar 2021 17:03:19 -0700 Subject: [PATCH 021/317] Update the documentation per comments II --- design/Commit/How a commit is done in FDB.md | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/design/Commit/How a commit is done in FDB.md b/design/Commit/How a commit is done in FDB.md index 1b6c9e62e3..739549aedf 100644 --- a/design/Commit/How a commit is done in FDB.md +++ b/design/Commit/How a commit is done in FDB.md @@ -56,13 +56,11 @@ Before all RPCs mentioned below, the client would first verify if the commit pro ### Post Resolution section * The proxy waits until the local batch number is current - -### TLog section * The proxy updates the metadata keys and attaches corresponding storage servers' tags to all mutations. * The proxy then waits until the commit version is current, i.e. 
the proxy's committed version is catching up with the commit version of the batch and these two versions are within the MVCC window. -* The proxy pushs the commit data to TLog -* TLog waits the commit version to current, then persist the commit. -* Wait until *all* TLogs returns the transaction result. +* The proxy pushes the commit data to TLogs. +* TLog waits the commit version to be current, then persists the commit. +* Wait until *all* TLogs return the transaction result. ### Reply section @@ -71,10 +69,10 @@ Before all RPCs mentioned below, the client would first verify if the commit pro ## Tracking the process using `g_traceBatch` -`g_traceBatch` can be used for querying the transactions and commits. A typical query string for Splunk is: +`g_traceBatch` can be used for querying the transactions and commits. A typical query in the trace logs is: ``` -LogGroup=loggroup Type=type Location=location +Type=type Location=location ``` The format of `location` is, in general, `..`, e.g. @@ -85,14 +83,14 @@ NativeAPI.getConsistentReadVersion.Before means the `location` is at `NativeAPI.actor.cpp`, `ACTOR` `getConsistentReadVersion`, `Before` requesting the read version from GRV Proxy. 
-Some of example queries are: +Some example queries are: ``` -LogGroup=loggroup Type=TransactionDebug Location=NativeAPI* +Type=TransactionDebug Location=NativeAPI* ``` ``` -LogGroup=loggroup Type=CommitDebug Location=storageserver* +LogGroup=loggroup Type=CommitDebug Location=Resolver.resolveBatch.* ``` From 5c1b674815b1765dbc08eed4d98875163dee5708 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Fri, 19 Mar 2021 10:31:58 -0600 Subject: [PATCH 022/317] implemented test --- flow/CMakeLists.txt | 2 +- flow/WriteOnlySet.actor.cpp | 159 +++++++++++++++++++ flow/{ActorLineageSet.cpp => WriteOnlySet.h} | 75 ++++----- 3 files changed, 187 insertions(+), 49 deletions(-) create mode 100644 flow/WriteOnlySet.actor.cpp rename flow/{ActorLineageSet.cpp => WriteOnlySet.h} (60%) diff --git a/flow/CMakeLists.txt b/flow/CMakeLists.txt index 5e89fe4d28..4c28aee437 100644 --- a/flow/CMakeLists.txt +++ b/flow/CMakeLists.txt @@ -3,7 +3,6 @@ find_package(Threads REQUIRED) set(FLOW_SRCS ActorCollection.actor.cpp ActorCollection.h - ActorLineageSet.cpp Arena.cpp Arena.h AsioReactor.h @@ -70,6 +69,7 @@ set(FLOW_SRCS TreeBenchmark.h UnitTest.cpp UnitTest.h + WriteOnlySet.actor.cpp XmlTraceLogFormatter.cpp XmlTraceLogFormatter.h actorcompiler.h diff --git a/flow/WriteOnlySet.actor.cpp b/flow/WriteOnlySet.actor.cpp new file mode 100644 index 0000000000..d0f7c514ad --- /dev/null +++ b/flow/WriteOnlySet.actor.cpp @@ -0,0 +1,159 @@ +/* + * WriteOnlySet.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2021 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flow/DeterministicRandom.h" +#include "flow/WriteOnlySet.h" +#include "flow/flow.h" +#include "flow/UnitTest.h" + +#include +#include +#include "flow/actorcompiler.h" // has to be last include + +template +auto WriteOnlySet::insert(const Reference& lineage) -> Index { + Index res; + if (!freeQueue.pop(res)) { + TraceEvent(SevWarnAlways, "NoCapacityInWriteOnlySet"); + return npos; + } + ASSERT(_set[res].load() & FREE); + auto ptr = reinterpret_cast(lineage.getPtr()); + ASSERT((ptr % 4) == 0); // this needs to be at least 4-byte aligned + ASSERT((ptr & FREE) == 0 && (ptr & LOCK) == 0); + lineage->addref(); + _set[res].store(ptr); + return res; +} + +template +void WriteOnlySet::erase(Index idx) { + while (true) { + auto ptr = _set[idx].load(); + if (ptr & LOCK) { + _set[idx].store(FREE); + freeList.push(reinterpret_cast(ptr ^ LOCK)); + return; + } else { + if (_set[idx].compare_exchange_strong(ptr, FREE)) { + reinterpret_cast(ptr)->delref(); + return; + } + } + } +} + +// Explicit instantiation +template class WriteOnlySet; + +// testing code +namespace { + +std::atomic instanceCounter = 0; +constexpr double iteration_frequency = 10.0; + +struct TestObject { + mutable std::atomic _refCount = 1; + TestObject() { instanceCounter.fetch_add(1); } + void delref() const { + if (--_refCount == 0) { + delete this; + --instanceCounter; + } + } + void addref() const { ++_refCount; } +}; + +using TestSet = WriteOnlySet; +using Clock = std::chrono::steady_clock; + +ACTOR Future threadjoiner(std::shared_ptr> threads, std::shared_ptr set) { + 
loop { + wait(delay(0.1)); + for (unsigned i = 0;;) { + if (threads->size() == i) { + break; + } + auto& t = (*threads)[i]; + if (t.joinable()) { + t.join(); + if (i + 1 < threads->size()) { + std::swap(*threads->rbegin(), (*threads)[i]); + } + threads->pop_back(); + } else { + ++i; + } + } + if (threads->empty()) { + set->copy(); + ASSERT(instanceCounter.load() == 0); + return Void(); + } + } +} + +void testCopier(std::shared_ptr set, std::chrono::seconds runFor) { + auto start = Clock::now(); + while (true) { + if (Clock::now() - start > runFor) { + return; + } + auto copy = set->copy(); + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + } +} + +void writer(std::shared_ptr set, std::chrono::seconds runFor) { + auto start = Clock::now(); + std::random_device rDev; + DeterministicRandom rnd(rDev()); + while (true) { + if (Clock::now() - start > runFor) { + return; + } + std::vector positions; + for (int i = 0; i < rnd.randomInt(1, 101); ++i) { + positions.push_back(set->insert(Reference(new TestObject()))); + } + rnd.randomShuffle(positions); + for (auto p : positions) { + set->erase(p); + } + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + } +} + +TEST_CASE("/flow/WriteOnlySet") { + if (g_network->isSimulated()) { + // This test is not deterministic, so we shouldn't run it in simulation + return Void(); + } + auto set = std::make_shared(); + auto threads = std::make_shared>(); + std::chrono::seconds runFor(10); + for (int i = 0; i < 5; ++i) { + threads->emplace_back([set, runFor]() { writer(set, runFor); }); + } + threads->emplace_back([set, runFor]() { testCopier(set, runFor); }); + wait(threadjoiner(threads, set)); + return Void(); +} +} // namespace \ No newline at end of file diff --git a/flow/ActorLineageSet.cpp b/flow/WriteOnlySet.h similarity index 60% rename from flow/ActorLineageSet.cpp rename to flow/WriteOnlySet.h index 570976379c..a319ad22f0 100644 --- a/flow/ActorLineageSet.cpp +++ b/flow/WriteOnlySet.h @@ -1,9 +1,9 @@ /* - * 
ActorLineageSet.cpp + * WriteOnlySet.cpp * * This source file is part of the FoundationDB open source project * - * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors + * Copyright 2013-2021 Apple Inc. and the FoundationDB project authors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,20 +18,23 @@ * limitations under the License. */ -#include "flow/flow.h" +#pragma once +#include "flow/Error.h" +#include "flow/FastRef.h" +#include "flow/Trace.h" #include -class ActorLineageSet { +template +class WriteOnlySet { public: // The type we use for lookup into the set. Gets assigned during insert - using Index = unsigned; + using Index = IndexType; // For now we use a fixed size capacity - constexpr static Index CAPACITY = 1024; constexpr static Index npos = std::numeric_limits::max(); - explicit ActorLineageSet(); - ActorLineageSet(const ActorLineageSet&) = delete; - ActorLineageSet& operator=(const ActorLineageSet&) = delete; + explicit WriteOnlySet(); + WriteOnlySet(const WriteOnlySet&) = delete; + WriteOnlySet& operator=(const WriteOnlySet&) = delete; // Returns the number of elements at the time of calling. Keep in mind that this is a lockfree data structure, so // the actual size might change anytime after or even during the call. This function only guarantees that the size @@ -39,36 +42,39 @@ public: // to handle this is by assuming that this returns an estimate. 
unsigned size(); - Index insert(const Reference& lineage); + Index insert(const Reference& lineage); void erase(Index idx); - std::vector> copy(); + std::vector> copy(); private: static constexpr uintptr_t FREE = 0b1; static constexpr uintptr_t LOCK = 0b10; - std::atomic _size = 0; + std::atomic _size = 0; std::vector> _set; + static_assert(std::atomic::is_always_lock_free, "Index type can't be used as a lock-free type"); + static_assert(std::atomic::is_always_lock_free, "uintptr_t can't be used as a lock-free type"); boost::lockfree::queue, boost::lockfree::capacity> freeQueue; - boost::lockfree::queue, boost::lockfree::capacity> - freeList; + boost::lockfree::queue, boost::lockfree::capacity> freeList; }; -ActorLineageSet::ActorLineageSet() { +template +WriteOnlySet::WriteOnlySet() : _set(CAPACITY) { // insert the free indexes in reverse order for (unsigned i = CAPACITY; i > 0; --i) { freeQueue.push(i - 1); - _set[i] = uintptr_t(1); + _set[i] = uintptr_t(FREE); } } -std::vector> ActorLineageSet::copy() { - std::vector> result; +template +std::vector> WriteOnlySet::copy() { + std::vector> result; for (int i = 0; i < CAPACITY; ++i) { auto ptr = _set[i].load(); if ((ptr & FREE) != 0) { ASSERT((ptr & LOCK) == 0); if (_set[i].compare_exchange_strong(ptr, ptr | LOCK)) { - ActorLineage* entry = reinterpret_cast(ptr); + T* entry = reinterpret_cast(ptr); ptr |= LOCK; entry->addref(); // we try to unlock now. 
If this element was removed while we incremented the refcount, the element will @@ -85,32 +91,5 @@ std::vector> ActorLineageSet::copy() { return result; } -ActorLineageSet::Index ActorLineageSet::insert(const Reference& lineage) { - Index res; - if (!freeQueue.pop(res)) { - TraceEvent(SevWarnAlways, "NoCapacityInActorLineageSet"); - return npos; - } - ASSERT(_set[res].load() & FREE); - auto ptr = reinterpret_cast(lineage.getPtr()); - ASSERT((ptr % 4) == 0); // this needs to be at least 4-byte aligned - lineage->addref(); - _set[res].store(ptr); - return res; -} - -void ActorLineageSet::erase(Index idx) { - while (true) { - auto ptr = _set[idx].load(); - if (ptr & LOCK) { - _set[idx].store(FREE); - freeList.push(reinterpret_cast(ptr ^ LOCK)); - return; - } else { - if (_set[idx].compare_exchange_strong(ptr, FREE)) { - reinterpret_cast(ptr)->delref(); - return; - } - } - } -} \ No newline at end of file +class ActorLineage; +extern template class WriteOnlySet; From 459afeed4cd9d6df4892e085f94d369af59f1efc Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Fri, 19 Mar 2021 11:25:55 -0600 Subject: [PATCH 023/317] disable jemalloc on macOS --- cmake/Jemalloc.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/Jemalloc.cmake b/cmake/Jemalloc.cmake index 6dff173b93..e89ef3ce82 100644 --- a/cmake/Jemalloc.cmake +++ b/cmake/Jemalloc.cmake @@ -3,7 +3,7 @@ add_library(jemalloc INTERFACE) set(USE_JEMALLOC ON) # We don't want to use jemalloc on Windows # Nor on FreeBSD, where jemalloc is the default system allocator -if(USE_SANITIZER OR WIN32 OR (CMAKE_SYSTEM_NAME STREQUAL "FreeBSD")) +if(USE_SANITIZER OR WIN32 OR (CMAKE_SYSTEM_NAME STREQUAL "FreeBSD") OR APPLE) set(USE_JEMALLOC OFF) return() endif() From 995ae34b1e637f6f776fc889e00474eb1ca1a322 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Fri, 19 Mar 2021 17:10:42 -0600 Subject: [PATCH 024/317] Bugfxies & hack to allow new unit test to run --- fdbserver/fdbserver.actor.cpp | 4 ++ 
flow/WriteOnlySet.actor.cpp | 89 ++++++++++++++++++++++++++++++----- flow/WriteOnlySet.h | 44 +++-------------- 3 files changed, 89 insertions(+), 48 deletions(-) diff --git a/fdbserver/fdbserver.actor.cpp b/fdbserver/fdbserver.actor.cpp index ff28269e4f..a285c0b958 100644 --- a/fdbserver/fdbserver.actor.cpp +++ b/fdbserver/fdbserver.actor.cpp @@ -66,6 +66,7 @@ #include "flow/SystemMonitor.h" #include "flow/TLSConfig.actor.h" #include "flow/Tracing.h" +#include "flow/WriteOnlySet.h" #if defined(__linux__) || defined(__FreeBSD__) #include @@ -1572,6 +1573,9 @@ private: } // namespace int main(int argc, char* argv[]) { + // TODO: Remove later, this is just to force the statics to be initialized + // otherwise the unit test won't run + ActorLineageSet _; try { platformInit(); diff --git a/flow/WriteOnlySet.actor.cpp b/flow/WriteOnlySet.actor.cpp index d0f7c514ad..32023f5e24 100644 --- a/flow/WriteOnlySet.actor.cpp +++ b/flow/WriteOnlySet.actor.cpp @@ -34,32 +34,75 @@ auto WriteOnlySet::insert(const Reference& lineage) - TraceEvent(SevWarnAlways, "NoCapacityInWriteOnlySet"); return npos; } - ASSERT(_set[res].load() & FREE); + ASSERT(_set[res].load() == 0); auto ptr = reinterpret_cast(lineage.getPtr()); - ASSERT((ptr % 4) == 0); // this needs to be at least 4-byte aligned - ASSERT((ptr & FREE) == 0 && (ptr & LOCK) == 0); + ASSERT((ptr % 2) == 0); // this needs to be at least 2-byte aligned + ASSERT(ptr != 0); lineage->addref(); _set[res].store(ptr); return res; } template -void WriteOnlySet::erase(Index idx) { +bool WriteOnlySet::eraseImpl(Index idx) { while (true) { auto ptr = _set[idx].load(); if (ptr & LOCK) { - _set[idx].store(FREE); + _set[idx].store(0); freeList.push(reinterpret_cast(ptr ^ LOCK)); - return; + return false; } else { - if (_set[idx].compare_exchange_strong(ptr, FREE)) { + if (_set[idx].compare_exchange_strong(ptr, 0)) { reinterpret_cast(ptr)->delref(); - return; + return true; } } } } +template +bool WriteOnlySet::erase(Index idx) { + auto res = 
eraseImpl(idx); + ASSERT(freeQueue.push(idx)); + return res; +} + +template +WriteOnlySet::WriteOnlySet() : _set(CAPACITY) { + // insert the free indexes in reverse order + for (unsigned i = CAPACITY; i > 0; --i) { + freeQueue.push(i - 1); + _set[i] = uintptr_t(0); + } +} + +template +std::vector> WriteOnlySet::copy() { + std::vector> result; + for (int i = 0; i < CAPACITY; ++i) { + auto ptr = _set[i].load(); + if (ptr) { + ASSERT((ptr & LOCK) == 0); // if we lock something we need to immediately unlock after we're done copying + // We attempt lock so this won't get deleted. We will try this only once, if the other thread removed the + // object from the set between the previews lines and now, we just won't make it part of the result. + if (_set[i].compare_exchange_strong(ptr, ptr | LOCK)) { + T* entry = reinterpret_cast(ptr); + ptr |= LOCK; + entry->addref(); + // we try to unlock now. If this element was removed while we incremented the refcount, the element will + // end up in the freeList, so we will decrement later. + _set[i].compare_exchange_strong(ptr, ptr ^ LOCK); + result.emplace_back(entry); + } + } + } + // after we're done we need to clean up all objects that contented on a lock. 
This won't be perfect (as some thread + // might not yet added the object to the free list), but whatever we don't get now we'll clean up in the next + // iteration + freeList.consume_all([](auto toClean) { toClean->delref(); }); + return result; +} + // Explicit instantiation template class WriteOnlySet; @@ -67,7 +110,10 @@ template class WriteOnlySet; namespace { std::atomic instanceCounter = 0; -constexpr double iteration_frequency = 10.0; +std::atomic numInserts = 0; +std::atomic numErase = 0; +std::atomic numLockedErase = 0; +std::atomic numCopied = 0; struct TestObject { mutable std::atomic _refCount = 1; @@ -117,6 +163,7 @@ void testCopier(std::shared_ptr set, std::chrono::seconds runFor) { return; } auto copy = set->copy(); + numCopied.fetch_add(copy.size()); std::this_thread::sleep_for(std::chrono::milliseconds(10)); } } @@ -126,17 +173,32 @@ void writer(std::shared_ptr set, std::chrono::seconds runFor) { std::random_device rDev; DeterministicRandom rnd(rDev()); while (true) { + unsigned inserts = 0, erases = 0; if (Clock::now() - start > runFor) { return; } std::vector positions; for (int i = 0; i < rnd.randomInt(1, 101); ++i) { - positions.push_back(set->insert(Reference(new TestObject()))); + Reference o(new TestObject()); + auto pos = set->insert(o); + if (pos == TestSet::npos) { + // could not insert -- ignore + break; + } + ++inserts; + ASSERT(pos < TestSet::capacity); + positions.push_back(pos); } rnd.randomShuffle(positions); for (auto p : positions) { - set->erase(p); + if (!set->erase(p)) { + ++numLockedErase; + } + ++erases; } + numInserts.fetch_add(inserts); + numErase.fetch_add(erases); + ASSERT(inserts == erases); std::this_thread::sleep_for(std::chrono::milliseconds(1)); } } @@ -154,6 +216,11 @@ TEST_CASE("/flow/WriteOnlySet") { } threads->emplace_back([set, runFor]() { testCopier(set, runFor); }); wait(threadjoiner(threads, set)); + TraceEvent("WriteOnlySetTestResult") + .detail("Inserts", numInserts.load()) + .detail("Erases", 
numErase.load()) + .detail("Copies", numCopied.load()) + .detail("LockedErase", numLockedErase.load()); return Void(); } } // namespace \ No newline at end of file diff --git a/flow/WriteOnlySet.h b/flow/WriteOnlySet.h index a319ad22f0..9d80795c68 100644 --- a/flow/WriteOnlySet.h +++ b/flow/WriteOnlySet.h @@ -31,6 +31,7 @@ public: using Index = IndexType; // For now we use a fixed size capacity constexpr static Index npos = std::numeric_limits::max(); + constexpr static IndexType capacity = CAPACITY; explicit WriteOnlySet(); WriteOnlySet(const WriteOnlySet&) = delete; @@ -43,12 +44,13 @@ public: unsigned size(); Index insert(const Reference& lineage); - void erase(Index idx); + bool erase(Index idx); std::vector> copy(); private: - static constexpr uintptr_t FREE = 0b1; - static constexpr uintptr_t LOCK = 0b10; + bool eraseImpl(Index idx); + + static constexpr uintptr_t LOCK = 0b1; std::atomic _size = 0; std::vector> _set; static_assert(std::atomic::is_always_lock_free, "Index type can't be used as a lock-free type"); @@ -57,39 +59,7 @@ private: boost::lockfree::queue, boost::lockfree::capacity> freeList; }; -template -WriteOnlySet::WriteOnlySet() : _set(CAPACITY) { - // insert the free indexes in reverse order - for (unsigned i = CAPACITY; i > 0; --i) { - freeQueue.push(i - 1); - _set[i] = uintptr_t(FREE); - } -} - -template -std::vector> WriteOnlySet::copy() { - std::vector> result; - for (int i = 0; i < CAPACITY; ++i) { - auto ptr = _set[i].load(); - if ((ptr & FREE) != 0) { - ASSERT((ptr & LOCK) == 0); - if (_set[i].compare_exchange_strong(ptr, ptr | LOCK)) { - T* entry = reinterpret_cast(ptr); - ptr |= LOCK; - entry->addref(); - // we try to unlock now. If this element was removed while we incremented the refcount, the element will - // end up in the freeList, so we will decrement later. - _set[i].compare_exchange_strong(ptr, ptr ^ LOCK); - result.emplace_back(entry); - } - } - } - // after we're done we need to clean up all objects that contented on a lock. 
This won't be perfect (as some thread - // might not yet added the object to the free list), but whatever we don't get now we'll clean up in the next - // iteration - freeList.consume_all([](auto toClean) { toClean->delref(); }); - return result; -} - class ActorLineage; extern template class WriteOnlySet; + +using ActorLineageSet = WriteOnlySet; From 99ac47e96c10922ca40e1267467bcfcbb51a51a0 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Fri, 19 Mar 2021 18:08:09 -0600 Subject: [PATCH 025/317] documentation --- flow/WriteOnlySet.actor.cpp | 6 ++++ flow/WriteOnlySet.h | 65 +++++++++++++++++++++++++++++++++---- 2 files changed, 64 insertions(+), 7 deletions(-) diff --git a/flow/WriteOnlySet.actor.cpp b/flow/WriteOnlySet.actor.cpp index 32023f5e24..93d9e99fc7 100644 --- a/flow/WriteOnlySet.actor.cpp +++ b/flow/WriteOnlySet.actor.cpp @@ -109,12 +109,14 @@ template class WriteOnlySet; // testing code namespace { +// Some statistics std::atomic instanceCounter = 0; std::atomic numInserts = 0; std::atomic numErase = 0; std::atomic numLockedErase = 0; std::atomic numCopied = 0; +// A simple object that counts the number of its instances. This is used to detect memory leaks. struct TestObject { mutable std::atomic _refCount = 1; TestObject() { instanceCounter.fetch_add(1); } @@ -130,6 +132,7 @@ struct TestObject { using TestSet = WriteOnlySet; using Clock = std::chrono::steady_clock; +// An actor that can join a set of threads in an async way. ACTOR Future threadjoiner(std::shared_ptr> threads, std::shared_ptr set) { loop { wait(delay(0.1)); @@ -156,6 +159,7 @@ ACTOR Future threadjoiner(std::shared_ptr> thread } } +// occasionally copy the contents of the past set. 
void testCopier(std::shared_ptr set, std::chrono::seconds runFor) { auto start = Clock::now(); while (true) { @@ -168,6 +172,7 @@ void testCopier(std::shared_ptr set, std::chrono::seconds runFor) { } } +// In a loop adds and removes a set of objects to the set void writer(std::shared_ptr set, std::chrono::seconds runFor) { auto start = Clock::now(); std::random_device rDev; @@ -203,6 +208,7 @@ void writer(std::shared_ptr set, std::chrono::seconds runFor) { } } +// This unit test creates 5 writer threads and one copier thread. TEST_CASE("/flow/WriteOnlySet") { if (g_network->isSimulated()) { // This test is not deterministic, so we shouldn't run it in simulation diff --git a/flow/WriteOnlySet.h b/flow/WriteOnlySet.h index 9d80795c68..a2589ec387 100644 --- a/flow/WriteOnlySet.h +++ b/flow/WriteOnlySet.h @@ -24,6 +24,21 @@ #include "flow/Trace.h" #include +/** + * This is a Write-Only set that supports copying the whole content. This data structure is lock-free and allows a user + * to insert and remove objects up to a given capacity (passed by a template). + * + * Template parameters: + * \param T The type to store. + * \param IndexType The type used as an index + * \param CAPACITY The maximum number of object this structure can store (if a user tries to store more, insert will + * fail gracefully) + * \pre T implements `void addref() const` and `void delref() const` + * \pre IndexType must have a copy constructor + * \pre IndexType must have a trivial assignment operator + * \pre IndexType must have a trivial destructor + * \pre IndexType can be used as an index into a std::vector + */ template class WriteOnlySet { public: @@ -37,25 +52,61 @@ public: WriteOnlySet(const WriteOnlySet&) = delete; WriteOnlySet& operator=(const WriteOnlySet&) = delete; - // Returns the number of elements at the time of calling. Keep in mind that this is a lockfree data structure, so - // the actual size might change anytime after or even during the call. 
This function only guarantees that the size - // was whatever the method returns at one point between the start and the end of the function call. The safest way - // to handle this is by assuming that this returns an estimate. - unsigned size(); + /** + * Attempts to insert \p lineage into the set. This method can fail if the set is full (its size is equal to its + * capacity). Calling insert on a full set is safe but the method will return \ref npos if the operation fails. + * + * \param lineage A reference to the object the user wants to insert. + * \ret An index that can later be used to erase the value again or \ref npos if the insert failed. + * \pre lineage.getPtr() % 2 == 0 (the memory for lineage has to be at least 2 byte aligned) + */ + [[nodiscard]] Index insert(const Reference& lineage); - Index insert(const Reference& lineage); + /** + * Erases the object associated with \p idx from the set. + * + * \ret Whether the reference count was decremented. Usually the return value is only interesting for testing and + * benchmarking purposes and will in most cases be ignored. If \ref delref wasn't called, it will be called + * later. Note that at the time the return value is checked, \ref delref might already have been called. + */ bool erase(Index idx); + /** + * Copies all elements that are stored in the set into a vector. This copy operation does NOT provide a snapshot of + * the data structure. The contract is weak: + * - All object that were in the set before copy is called and weren't removed until after copy returned are + * guaranteed to be in the result. + * - Any object that was inserted while copy is running might be in the result. + * - Any object that was erased while copy is running might be in the result. + */ std::vector> copy(); private: + // the implementation of erase -- the wrapper just makes the function a bit more readable. 
bool eraseImpl(Index idx); + // the last bit of a pointer within the set is used like a boolean and true means that the object is locked. Locking + // an object is only relevant for memory management. A locked pointer can still be erased from the set, but the + // erase won't call delref on the object. Instead it will push the pointer into the \ref freeList and copy will call + // delref later. static constexpr uintptr_t LOCK = 0b1; - std::atomic _size = 0; + + // The actual memory std::vector> _set; static_assert(std::atomic::is_always_lock_free, "Index type can't be used as a lock-free type"); static_assert(std::atomic::is_always_lock_free, "uintptr_t can't be used as a lock-free type"); + + // The freeQueue. On creation all indexes (0..capacity-1) are pushed into this queue. On insert one element from + // this queue is consumed and the resulting number is used as an index into the set. On erase the index is given + // back to the freeQueue. boost::lockfree::queue, boost::lockfree::capacity> freeQueue; + + // The freeList is used for memory management. Generally copying a shared pointer can't be done in a lock-free way. + // Instead, when we copy the data structure we first copy the address, then attempt to set the last bit to 1 and + // only if that succeeds we will increment the reference count. Whenever we attempt to remove an object + // in \ref erase we remove the object from the set (using an atomic compare and swap) and only decrement the + // reference count if the last bit is 0. If it's not we'll push the pointer into this free list. + // \ref copy will consume all elements from this freeList each time it runs and decrements the refcount for each + // element. 
boost::lockfree::queue, boost::lockfree::capacity> freeList; }; From 6746bbaba7d919736ecd7b8d5d9134507f448bde Mon Sep 17 00:00:00 2001 From: Xiaoge Su Date: Mon, 22 Mar 2021 10:36:45 -0700 Subject: [PATCH 026/317] Update the document per comments III --- design/Commit/How a commit is done in FDB.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/design/Commit/How a commit is done in FDB.md b/design/Commit/How a commit is done in FDB.md index 739549aedf..1e34ac481e 100644 --- a/design/Commit/How a commit is done in FDB.md +++ b/design/Commit/How a commit is done in FDB.md @@ -1,5 +1,8 @@ # How a commit is done in FDB +This doc describes how commit is done in FDB 6.3+. +The commit path in FDB 6.3 and before is documented in [documentation/sphinx/source/read-write-path.rst](https://github.com/apple/foundationdb/pull/4099). + ## Overall description Legend: From 61352b912444c5d3601b8e33de234cc1f61fe32b Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Mon, 22 Mar 2021 11:41:45 -0600 Subject: [PATCH 027/317] use push_back where emplace_back is unnecessary --- flow/WriteOnlySet.actor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flow/WriteOnlySet.actor.cpp b/flow/WriteOnlySet.actor.cpp index 93d9e99fc7..9ab63aa56f 100644 --- a/flow/WriteOnlySet.actor.cpp +++ b/flow/WriteOnlySet.actor.cpp @@ -92,7 +92,7 @@ std::vector> WriteOnlySet::copy() { // we try to unlock now. If this element was removed while we incremented the refcount, the element will // end up in the freeList, so we will decrement later. 
_set[i].compare_exchange_strong(ptr, ptr ^ LOCK); - result.emplace_back(entry); + result.push_back(entry); } } } From 301daf326939d6378d410420d007322f7c7a3dd3 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Mon, 22 Mar 2021 11:46:16 -0600 Subject: [PATCH 028/317] address review comments --- flow/WriteOnlySet.actor.cpp | 2 +- flow/WriteOnlySet.h | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/flow/WriteOnlySet.actor.cpp b/flow/WriteOnlySet.actor.cpp index 9ab63aa56f..364c53460d 100644 --- a/flow/WriteOnlySet.actor.cpp +++ b/flow/WriteOnlySet.actor.cpp @@ -1,5 +1,5 @@ /* - * WriteOnlySet.cpp + * WriteOnlySet.actor.cpp * * This source file is part of the FoundationDB open source project * diff --git a/flow/WriteOnlySet.h b/flow/WriteOnlySet.h index a2589ec387..c71736f852 100644 --- a/flow/WriteOnlySet.h +++ b/flow/WriteOnlySet.h @@ -1,5 +1,5 @@ /* - * WriteOnlySet.cpp + * WriteOnlySet.h * * This source file is part of the FoundationDB open source project * @@ -50,7 +50,9 @@ public: explicit WriteOnlySet(); WriteOnlySet(const WriteOnlySet&) = delete; + WriteOnlySet(WriteOnlySet&&) = delete; WriteOnlySet& operator=(const WriteOnlySet&) = delete; + WriteOnlySet& operator=(WriteOnlySet&&) = delete; /** * Attempts to insert \p lineage into the set. This method can fail if the set is full (its size is equal to its @@ -93,7 +95,7 @@ private: // The actual memory std::vector> _set; static_assert(std::atomic::is_always_lock_free, "Index type can't be used as a lock-free type"); - static_assert(std::atomic::is_always_lock_free, "uintptr_t can't be used as a lock-free type"); + static_assert(std::atomic::is_always_lock_free, "uintptr_t can't be used as a lock-free type"); // The freeQueue. On creation all indexes (0..capacity-1) are pushed into this queue. On insert one element from // this queue is consumed and the resulting number is used as an index into the set. 
On erase the index is given From 5bd79de88179945a78e7862d90e7de183d3d690c Mon Sep 17 00:00:00 2001 From: Lukas Joswiak Date: Mon, 22 Mar 2021 10:01:28 -0700 Subject: [PATCH 029/317] Fix build --- flow/Profiler.actor.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/flow/Profiler.actor.cpp b/flow/Profiler.actor.cpp index 46b0bcecb4..24bba87739 100644 --- a/flow/Profiler.actor.cpp +++ b/flow/Profiler.actor.cpp @@ -142,6 +142,8 @@ struct Profiler { } void signal_handler() { // async signal safe! + static std::atomic inSigHandler = false; + if (inSigHandler.exchange(true)) { return; } if (profilingEnabled) { double t = timer(); output_buffer->push(*(void**)&t); From 0ec7340a6f72f8d29b43ade50667d2b0e88ebd75 Mon Sep 17 00:00:00 2001 From: Lukas Joswiak Date: Mon, 22 Mar 2021 10:55:52 -0700 Subject: [PATCH 030/317] Create reference --- flow/WriteOnlySet.actor.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/flow/WriteOnlySet.actor.cpp b/flow/WriteOnlySet.actor.cpp index 364c53460d..92eceea7bc 100644 --- a/flow/WriteOnlySet.actor.cpp +++ b/flow/WriteOnlySet.actor.cpp @@ -92,7 +92,7 @@ std::vector> WriteOnlySet::copy() { // we try to unlock now. If this element was removed while we incremented the refcount, the element will // end up in the freeList, so we will decrement later. 
_set[i].compare_exchange_strong(ptr, ptr ^ LOCK); - result.push_back(entry); + result.push_back(Reference(entry)); } } } @@ -229,4 +229,4 @@ TEST_CASE("/flow/WriteOnlySet") { .detail("LockedErase", numLockedErase.load()); return Void(); } -} // namespace \ No newline at end of file +} // namespace From b246e673bceab43b28cc4a855584333eb3404146 Mon Sep 17 00:00:00 2001 From: Jon Fu Date: Wed, 24 Mar 2021 15:34:19 -0400 Subject: [PATCH 031/317] Added comment to seedShardServers (taken from existing desc in .h file) --- fdbserver/MoveKeys.actor.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fdbserver/MoveKeys.actor.cpp b/fdbserver/MoveKeys.actor.cpp index 83f7170e95..0702b8d097 100644 --- a/fdbserver/MoveKeys.actor.cpp +++ b/fdbserver/MoveKeys.actor.cpp @@ -1212,6 +1212,8 @@ ACTOR Future moveKeys(Database cx, return Void(); } +// Called by the master server to write the very first transaction to the database +// establishing a set of shard servers and all invariants of the systemKeys. 
void seedShardServers(Arena& arena, CommitTransactionRef& tr, vector servers) { std::map, Tag> dcId_locality; std::map server_tag; From 2dfd420882537d7fa7d477c08b699f1a5e961a1c Mon Sep 17 00:00:00 2001 From: Lukas Joswiak Date: Wed, 24 Mar 2021 14:52:42 -0700 Subject: [PATCH 032/317] Add sampling profiler thread --- fdbrpc/AsyncFileKAIO.actor.h | 6 +++++- fdbrpc/IAsyncFile.h | 4 ++++ fdbrpc/Net2FileSystem.cpp | 4 ++++ fdbrpc/Net2FileSystem.h | 3 +++ fdbrpc/sim2.actor.cpp | 4 ++++ fdbrpc/simulator.h | 4 ++++ fdbserver/fdbserver.actor.cpp | 1 + flow/Platform.actor.cpp | 27 +++++++++++++++++++++++++++ flow/Platform.h | 2 ++ 9 files changed, 54 insertions(+), 1 deletion(-) diff --git a/fdbrpc/AsyncFileKAIO.actor.h b/fdbrpc/AsyncFileKAIO.actor.h index 5e6592e6ba..dbdb040d00 100644 --- a/fdbrpc/AsyncFileKAIO.actor.h +++ b/fdbrpc/AsyncFileKAIO.actor.h @@ -242,7 +242,11 @@ public: // result = map(result, [=](int r) mutable { KAIOLogBlockEvent(io, OpLogEntry::READY, r); return r; }); #endif - return success(result); + auto& actorLineageSet = IAsyncFileSystem::filesystem()->getActorLineageSet(); + auto index = actorLineageSet.insert(currentLineage); + Future res = success(result); + actorLineageSet.erase(index); + return res; } // TODO(alexmiller): Remove when we upgrade the dev docker image to >14.10 #ifndef FALLOC_FL_ZERO_RANGE diff --git a/fdbrpc/IAsyncFile.h b/fdbrpc/IAsyncFile.h index ed703514c6..ad48db5f07 100644 --- a/fdbrpc/IAsyncFile.h +++ b/fdbrpc/IAsyncFile.h @@ -25,6 +25,7 @@ #include #include "flow/flow.h" +#include "flow/WriteOnlySet.h" #include "fdbrpc/IRateControl.h" // All outstanding operations must be cancelled before the destructor of IAsyncFile is called. @@ -118,6 +119,9 @@ public: // Returns the time of the last modification of the file. virtual Future lastWriteTime(const std::string& filename) = 0; + // Returns the shared memory data structure used to store actor lineages. 
+ virtual ActorLineageSet& getActorLineageSet() = 0; + static IAsyncFileSystem* filesystem() { return filesystem(g_network); } static runCycleFuncPtr runCycleFunc() { return reinterpret_cast( diff --git a/fdbrpc/Net2FileSystem.cpp b/fdbrpc/Net2FileSystem.cpp index 71a7d784a1..8e895c08dc 100644 --- a/fdbrpc/Net2FileSystem.cpp +++ b/fdbrpc/Net2FileSystem.cpp @@ -89,6 +89,10 @@ Future Net2FileSystem::lastWriteTime(const std::string& filename) { return Net2AsyncFile::lastWriteTime(filename); } +ActorLineageSet& Net2FileSystem::getActorLineageSet() { + return actorLineageSet; +} + void Net2FileSystem::newFileSystem(double ioTimeout, const std::string& fileSystemPath) { g_network->setGlobal(INetwork::enFileSystem, (flowGlobalType) new Net2FileSystem(ioTimeout, fileSystemPath)); } diff --git a/fdbrpc/Net2FileSystem.h b/fdbrpc/Net2FileSystem.h index 702b87828f..0c2229b5ca 100644 --- a/fdbrpc/Net2FileSystem.h +++ b/fdbrpc/Net2FileSystem.h @@ -39,6 +39,8 @@ public: Future renameFile(std::string const& from, std::string const& to) override; + ActorLineageSet& getActorLineageSet() override; + // void init(); static void stop(); @@ -52,6 +54,7 @@ public: dev_t fileSystemDeviceId; bool checkFileSystem; #endif + ActorLineageSet actorLineageSet; }; #endif diff --git a/fdbrpc/sim2.actor.cpp b/fdbrpc/sim2.actor.cpp index 6101ca8512..e9219f3ff3 100644 --- a/fdbrpc/sim2.actor.cpp +++ b/fdbrpc/sim2.actor.cpp @@ -2494,6 +2494,10 @@ Future Sim2FileSystem::lastWriteTime(const std::string& filename) { return fileWrites[filename]; } +ActorLineageSet& Sim2FileSystem::getActorLineageSet() { + return actorLineageSet; +} + void Sim2FileSystem::newFileSystem() { g_network->setGlobal(INetwork::enFileSystem, (flowGlobalType) new Sim2FileSystem()); } diff --git a/fdbrpc/simulator.h b/fdbrpc/simulator.h index cde0eb0dda..08b4264e81 100644 --- a/fdbrpc/simulator.h +++ b/fdbrpc/simulator.h @@ -471,6 +471,8 @@ public: Future lastWriteTime(const std::string& filename) override; + ActorLineageSet& 
getActorLineageSet() override; + Future renameFile(std::string const& from, std::string const& to) override; Sim2FileSystem() {} @@ -478,6 +480,8 @@ public: ~Sim2FileSystem() override {} static void newFileSystem(); + + ActorLineageSet actorLineageSet; }; #endif diff --git a/fdbserver/fdbserver.actor.cpp b/fdbserver/fdbserver.actor.cpp index a285c0b958..fbcd7fd9ee 100644 --- a/fdbserver/fdbserver.actor.cpp +++ b/fdbserver/fdbserver.actor.cpp @@ -1948,6 +1948,7 @@ int main(int argc, char* argv[]) { ASSERT(opts.connectionFile); setupRunLoopProfiler(); + setupSamplingProfiler(); auto dataFolder = opts.dataFolder; if (!dataFolder.size()) diff --git a/flow/Platform.actor.cpp b/flow/Platform.actor.cpp index 42d8decccc..756fb6a7e3 100644 --- a/flow/Platform.actor.cpp +++ b/flow/Platform.actor.cpp @@ -48,6 +48,8 @@ #include "flow/UnitTest.h" #include "flow/FaultInjection.h" +#include "fdbrpc/IAsyncFile.h" + #ifdef _WIN32 #include #include @@ -3673,6 +3675,31 @@ void setupRunLoopProfiler() { #endif } +void* sampleThread(void* arg) { + while (true) { + threadSleep(1.0); // TODO: Read sample rate from global config + + // TODO: Copy actor lineage of currently running actor + + auto diskAlps = IAsyncFileSystem::filesystem()->getActorLineageSet().copy(); + printf("Disk ALPs: %d\n", diskAlps.size()); + + // TODO: Call collect on all actor lineages + for (auto actorLineage : diskAlps) { + } + + // TODO: Serialize collected actor linage properties + } + + return nullptr; +} + +void setupSamplingProfiler() { + // TODO: Add knob + TraceEvent("StartingSamplingProfilerThread"); + startThread(&sampleThread, nullptr); +} + // UnitTest for getMemoryInfo #ifdef __linux__ TEST_CASE("/flow/Platform/getMemoryInfo") { diff --git a/flow/Platform.h b/flow/Platform.h index 74c9395c53..edf9ff3997 100644 --- a/flow/Platform.h +++ b/flow/Platform.h @@ -741,6 +741,8 @@ void registerCrashHandler(); void setupRunLoopProfiler(); EXTERNC void setProfilingEnabled(int enabled); +void 
setupSamplingProfiler(); + // Use _exit() or criticalError(), not exit() #define exit static_assert(false, "Calls to exit() are forbidden by policy"); From 36f4c17ef143cd3c82b7038f001d256867e2a7fa Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Wed, 24 Mar 2021 15:04:45 -0700 Subject: [PATCH 033/317] Reduce the number of actor calls in load balancing to improve performance. --- fdbrpc/LoadBalance.actor.h | 321 +++++++++++++++++++++---------------- 1 file changed, 184 insertions(+), 137 deletions(-) diff --git a/fdbrpc/LoadBalance.actor.h b/fdbrpc/LoadBalance.actor.h index 9b47912993..78f73352ba 100644 --- a/fdbrpc/LoadBalance.actor.h +++ b/fdbrpc/LoadBalance.actor.h @@ -75,109 +75,169 @@ struct LoadBalancedReply { Optional getLoadBalancedReply(const LoadBalancedReply* reply); Optional getLoadBalancedReply(const void*); -// Returns true if we got a value for our request -// Throws an error if the request returned an error that should bubble out -// Returns false if we got an error that should result in reissuing the request -template -bool checkAndProcessResult(ErrorOr result, Reference holder, bool atMostOnce, bool triedAllOptions) { - Optional loadBalancedReply; - if (!result.isError()) { - loadBalancedReply = getLoadBalancedReply(&result.get()); +// Stores state for a request made by the load balancer +template +struct RequestData : NonCopyable { + Future> response; + Reference modelHolder; + Future backoffDelay; + RequestStream const* stream = nullptr; + bool triedAllOptions = false; + + bool requestStarted = false; // true once the request has been sent to an alternative + bool requestProcessed = false; // true once a response has been received and handled by checkAndProcessResult + + // Whether or not the response future is valid + // This is true once setupRequest is called, even though at that point the response is Never(). 
+ bool isValid() { return response.isValid(); } + + // Initializes the request state and starts the backoff delay + void setupRequest(double backoff, bool triedAllOptions, RequestStream const* stream) { + backoffDelay = (backoff > 0) ? delay(backoff) : Void(); + response = Never(); + modelHolder = Reference(); + requestStarted = false; + requestProcessed = false; + + this->stream = stream; + this->triedAllOptions = triedAllOptions; } - int errCode; - if (loadBalancedReply.present()) { - errCode = - loadBalancedReply.get().error.present() ? loadBalancedReply.get().error.get().code() : error_code_success; - } else { - errCode = result.isError() ? result.getError().code() : error_code_success; + // Sends the request to the configured stream + // This should not be called until after setupRequest has been called and the backoff delay has elapsed + void startRequest(Request request, QueueModel* model) { + ASSERT(stream); + ASSERT(backoffDelay.isReady()); + + backoffDelay = Never(); + modelHolder = Reference(new ModelHolder(model, stream->getEndpoint().token.first())); + response = stream->tryGetReply(request); + requestStarted = true; } - bool maybeDelivered = errCode == error_code_broken_promise || errCode == error_code_request_maybe_delivered; - bool receivedResponse = loadBalancedReply.present() ? !loadBalancedReply.get().error.present() : result.present(); - receivedResponse = receivedResponse || (!maybeDelivered && errCode != error_code_process_behind); - bool futureVersion = errCode == error_code_future_version || errCode == error_code_process_behind; + // Implementation of the logic to handle a response. 
+ // Checks the state of the response, updates the queue model, and returns one of the following outcomes: + // A return value of true means that the request completed successfully + // A return value of false means that the request failed but should be retried + // A return value with an error means that the error should be thrown back to original caller + static ErrorOr checkAndProcessResultImpl(ErrorOr result, + Reference modelHolder, + bool atMostOnce, + bool triedAllOptions) { + ASSERT(modelHolder); - holder->release( - receivedResponse, futureVersion, loadBalancedReply.present() ? loadBalancedReply.get().penalty : -1.0); + Optional loadBalancedReply; + if (!result.isError()) { + loadBalancedReply = getLoadBalancedReply(&result.get()); + } + + int errCode; + if (loadBalancedReply.present()) { + errCode = loadBalancedReply.get().error.present() ? loadBalancedReply.get().error.get().code() + : error_code_success; + } else { + errCode = result.isError() ? result.getError().code() : error_code_success; + } + + bool maybeDelivered = errCode == error_code_broken_promise || errCode == error_code_request_maybe_delivered; + bool receivedResponse = + loadBalancedReply.present() ? !loadBalancedReply.get().error.present() : result.present(); + receivedResponse = receivedResponse || (!maybeDelivered && errCode != error_code_process_behind); + bool futureVersion = errCode == error_code_future_version || errCode == error_code_process_behind; + + modelHolder->release( + receivedResponse, futureVersion, loadBalancedReply.present() ? loadBalancedReply.get().penalty : -1.0); + + if (errCode == error_code_server_overloaded) { + return false; + } + + if (loadBalancedReply.present() && !loadBalancedReply.get().error.present()) { + return true; + } + + if (!loadBalancedReply.present() && result.present()) { + return true; + } + + if (receivedResponse) { + return loadBalancedReply.present() ? 
loadBalancedReply.get().error.get() : result.getError(); + } + + if (atMostOnce && maybeDelivered) { + return request_maybe_delivered(); + } + + if (triedAllOptions && errCode == error_code_process_behind) { + return process_behind(); + } - if (errCode == error_code_server_overloaded) { return false; } - if (loadBalancedReply.present() && !loadBalancedReply.get().error.present()) { - return true; + // Checks the state of the response, updates the queue model, and returns one of the following outcomes: + // A return value of true means that the request completed successfully + // A return value of false means that the request failed but should be retried + // In the event of a non-retryable failure, an error is thrown indicating the failure + bool checkAndProcessResult(bool atMostOnce) { + ASSERT(response.isReady()); + requestProcessed = true; + + ErrorOr outcome = + checkAndProcessResultImpl(response.get(), std::move(modelHolder), atMostOnce, triedAllOptions); + + if (outcome.isError()) { + throw outcome.getError(); + } else if (!outcome.get()) { + response = Future>(); + } + + return outcome.get(); } - if (!loadBalancedReply.present() && result.present()) { - return true; + // Convert this request to a lagging request. Such a request is no longer being waited on, but it still needs to be + // processed so we can update the queue model. 
+ void makeLaggingRequest() { + ASSERT(response.isValid()); + ASSERT(!response.isReady()); + ASSERT(modelHolder); + ASSERT(modelHolder->model); + + QueueModel* model = modelHolder->model; + if (model->laggingRequestCount > FLOW_KNOBS->MAX_LAGGING_REQUESTS_OUTSTANDING || + model->laggingRequests.isReady()) { + model->laggingRequests.cancel(); + model->laggingRequestCount = 0; + model->addActor = PromiseStream>(); + model->laggingRequests = actorCollection(model->addActor.getFuture(), &model->laggingRequestCount); + } + + // We need to process the lagging request in order to update the queue model + Reference holderCapture = std::move(modelHolder); + bool triedAllOptionsCapture = triedAllOptions; + Future updateModel = + map(response, [holderCapture, triedAllOptionsCapture](ErrorOr result) { + checkAndProcessResultImpl(result, holderCapture, false, triedAllOptionsCapture); + return Void(); + }); + model->addActor.send(updateModel); } - if (receivedResponse) { - throw loadBalancedReply.present() ? 
loadBalancedReply.get().error.get() : result.getError(); - } - - if (atMostOnce && maybeDelivered) { - throw request_maybe_delivered(); - } - - if (triedAllOptions && errCode == error_code_process_behind) { - throw process_behind(); - } - - return false; -} - -ACTOR template -Future> makeRequest(RequestStream const* stream, - Request request, - double backoff, - Future requestUnneeded, - QueueModel* model, - bool isFirstRequest, - bool atMostOnce, - bool triedAllOptions) { - if (backoff > 0.0) { - wait(delay(backoff) || requestUnneeded); - } - - if (requestUnneeded.isReady()) { - return Optional(); - } - - state Reference holder(new ModelHolder(model, stream->getEndpoint().token.first())); - - ErrorOr result = wait(stream->tryGetReply(request)); - if (checkAndProcessResult(result, holder, atMostOnce, triedAllOptions)) { - return result.get(); - } else { - return Optional(); - } -} - -template -void addLaggingRequest(Future> reply, Promise requestFinished, QueueModel* model) { - requestFinished.send(Void()); - if (!reply.isReady()) { - if (model) { - if (model->laggingRequestCount > FLOW_KNOBS->MAX_LAGGING_REQUESTS_OUTSTANDING || - model->laggingRequests.isReady()) { - model->laggingRequests.cancel(); - model->laggingRequestCount = 0; - model->addActor = PromiseStream>(); - model->laggingRequests = actorCollection(model->addActor.getFuture(), &model->laggingRequestCount); - } - - model->addActor.send(success(errorOr(reply))); + ~RequestData() { + // If the request has been started but hasn't completed, mark it as a lagging request + if (requestStarted && !requestProcessed && modelHolder && modelHolder->model) { + makeLaggingRequest(); } } -} +}; -// Keep trying to get a reply from any of servers until success or cancellation; tries to take into account -// failMon's information for load balancing and avoiding failed servers +// Try to get a reply from one of the alternatives until success, cancellation, or certain errors. 
+// Load balancing has a budget to race requests to a second alternative if the first request is slow. +// Tries to take into account failMon's information for load balancing and avoiding failed servers. // If ALL the servers are failed and the list of servers is not fresh, throws an exception to let the caller refresh the -// list of servers. When model is set, load balance among alternatives in the same DC, aiming to balance request queue -// length on these interfaces. If too many interfaces in the same DC are bad, try remote interfaces. +// list of servers. +// When model is set, load balance among alternatives in the same DC aims to balance request queue length on these +// interfaces. If too many interfaces in the same DC are bad, try remote interfaces. ACTOR template Future loadBalance( Reference> alternatives, @@ -186,9 +246,11 @@ Future loadBalance( TaskPriority taskID = TaskPriority::DefaultPromiseEndpoint, bool atMostOnce = false, // if true, throws request_maybe_delivered() instead of retrying automatically QueueModel* model = nullptr) { - state Future> firstRequest; + + state RequestData firstRequestData; + state RequestData secondRequestData; + state Optional firstRequestEndpoint; - state Future> secondRequest; state Future secondDelay = Never(); state Promise requestFinished; @@ -320,7 +382,7 @@ Future loadBalance( } // Find an alternative, if any, that is not failed, starting with - // nextAlt. This logic matters only if model == NULL. Otherwise, the + // nextAlt. This logic matters only if model == nullptr. Otherwise, the // bestAlt and nextAlt have been decided. state RequestStream const* stream = nullptr; for (int alternativeNum = 0; alternativeNum < alternatives->size(); alternativeNum++) { @@ -340,7 +402,7 @@ Future loadBalance( stream = nullptr; } - if (!stream && !firstRequest.isValid()) { + if (!stream && !firstRequestData.isValid()) { // Everything is down! Wait for someone to be up. 
vector> ok(alternatives->size()); @@ -391,50 +453,40 @@ Future loadBalance( numAttempts = 0; // now that we've got a server back, reset the backoff } else if (!stream) { // Only the first location is available. - Optional result = wait(firstRequest); - if (result.present()) { - return result.get(); - } + loop choose { + when(wait(firstRequestData.backoffDelay)) { firstRequestData.startRequest(request, model); } + when(ErrorOr result = wait(firstRequestData.response)) { + if (firstRequestData.checkAndProcessResult(atMostOnce)) { + return result.get(); + } - firstRequest = Future>(); - firstRequestEndpoint = Optional(); - } else if (firstRequest.isValid()) { + firstRequestEndpoint = Optional(); + break; + } + } + } else if (firstRequestData.isValid()) { // Issue a second request, the first one is taking a long time. - secondRequest = makeRequest( - stream, request, backoff, requestFinished.getFuture(), model, false, atMostOnce, triedAllOptions); + secondRequestData.setupRequest(backoff, triedAllOptions, stream); state bool firstFinished = false; - loop { - choose { - when(ErrorOr> result = - wait(firstRequest.isValid() ? errorOr(firstRequest) : Never())) { - if (result.isError() || result.get().present()) { - addLaggingRequest(secondRequest, requestFinished, model); - if (result.isError()) { - throw result.getError(); - } else { - return result.get().get(); - } - } - - firstRequest = Future>(); - firstRequestEndpoint = Optional(); - firstFinished = true; + loop choose { + when(wait(firstRequestData.backoffDelay)) { firstRequestData.startRequest(request, model); } + when(wait(secondRequestData.backoffDelay)) { secondRequestData.startRequest(request, model); } + when(ErrorOr result = + wait(firstRequestData.response.isValid() ? 
firstRequestData.response : Never())) { + if (firstRequestData.checkAndProcessResult(atMostOnce)) { + return result.get(); } - when(ErrorOr> result = wait(errorOr(secondRequest))) { - if (result.isError() || result.get().present()) { - if (!firstFinished) { - addLaggingRequest(firstRequest, requestFinished, model); - } - if (result.isError()) { - throw result.getError(); - } else { - return result.get().get(); - } - } - break; + firstRequestEndpoint = Optional(); + firstFinished = true; + } + when(ErrorOr result = wait(secondRequestData.response)) { + if (secondRequestData.checkAndProcessResult(atMostOnce)) { + return result.get(); } + + break; } } @@ -445,13 +497,13 @@ Future loadBalance( } } else { // Issue a request, if it takes too long to get a reply, go around the loop - firstRequest = makeRequest( - stream, request, backoff, requestFinished.getFuture(), model, true, atMostOnce, triedAllOptions); + firstRequestData.setupRequest(backoff, triedAllOptions, stream); firstRequestEndpoint = stream->getEndpoint().token.first(); loop { choose { - when(ErrorOr> result = wait(errorOr(firstRequest))) { + when(wait(firstRequestData.backoffDelay)) { firstRequestData.startRequest(request, model); } + when(ErrorOr result = wait(firstRequestData.response)) { if (model) { model->secondMultiplier = std::max(model->secondMultiplier - FLOW_KNOBS->SECOND_REQUEST_MULTIPLIER_DECAY, 1.0); @@ -460,15 +512,10 @@ Future loadBalance( FLOW_KNOBS->SECOND_REQUEST_MAX_BUDGET); } - if (result.isError()) { - throw result.getError(); + if (firstRequestData.checkAndProcessResult(atMostOnce)) { + return result.get(); } - if (result.get().present()) { - return result.get().get(); - } - - firstRequest = Future>(); firstRequestEndpoint = Optional(); break; } From f7d3b31ef8f93a9ec845bef3a8216e70c384d804 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Wed, 24 Mar 2021 16:27:35 -0600 Subject: [PATCH 034/317] Actually close files in simulation --- fdbrpc/AsyncFileNonDurable.actor.h | 4 ++++ 
fdbrpc/sim2.actor.cpp | 16 ++++++++-------- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/fdbrpc/AsyncFileNonDurable.actor.h b/fdbrpc/AsyncFileNonDurable.actor.h index 49fe0e2c8f..13fdcc25a5 100644 --- a/fdbrpc/AsyncFileNonDurable.actor.h +++ b/fdbrpc/AsyncFileNonDurable.actor.h @@ -267,6 +267,10 @@ public: Future deleteFuture = deleteFile(this); if (!deleteFuture.isReady()) filesBeingDeleted[filename] = deleteFuture; + } else if (isSoleOwner()) { + // isSoleOwner is a bit confusing here. What we mean is that the openFiles map is the sole owner. If we + // we remove the file from the map to make sure it gets closed. + g_simulator.getCurrentProcess()->machine->openFiles.erase(filename); } } diff --git a/fdbrpc/sim2.actor.cpp b/fdbrpc/sim2.actor.cpp index 1af14ec676..6cddbb7e88 100644 --- a/fdbrpc/sim2.actor.cpp +++ b/fdbrpc/sim2.actor.cpp @@ -536,7 +536,10 @@ public: std::string getFilename() const override { return actualFilename; } - ~SimpleFile() override { _close(h); } + ~SimpleFile() override { + _close(h); + --openCount; + } private: int h; @@ -1933,10 +1936,7 @@ public: TraceEvent("ClogInterface") .detail("IP", ip.toString()) .detail("Delay", seconds) - .detail("Queue", - mode == ClogSend ? "Send" - : mode == ClogReceive ? "Receive" - : "All"); + .detail("Queue", mode == ClogSend ? "Send" : mode == ClogReceive ? "Receive" : "All"); if (mode == ClogSend || mode == ClogAll) g_clogging.clogSendFor(ip, seconds); @@ -2408,9 +2408,9 @@ int sf_open(const char* filename, int flags, int convFlags, int mode) { GENERIC_READ | ((flags & IAsyncFile::OPEN_READWRITE) ? GENERIC_WRITE : 0), FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, nullptr, - (flags & IAsyncFile::OPEN_EXCLUSIVE) ? CREATE_NEW - : (flags & IAsyncFile::OPEN_CREATE) ? OPEN_ALWAYS - : OPEN_EXISTING, + (flags & IAsyncFile::OPEN_EXCLUSIVE) + ? CREATE_NEW + : (flags & IAsyncFile::OPEN_CREATE) ? 
OPEN_ALWAYS : OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, nullptr); int h = -1; From 6a344ddeab4eac19ee34f1af7649a6b5e8e39efc Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Wed, 24 Mar 2021 16:56:11 -0600 Subject: [PATCH 035/317] fix typo --- fdbrpc/AsyncFileNonDurable.actor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbrpc/AsyncFileNonDurable.actor.h b/fdbrpc/AsyncFileNonDurable.actor.h index 13fdcc25a5..8cc65bf4a5 100644 --- a/fdbrpc/AsyncFileNonDurable.actor.h +++ b/fdbrpc/AsyncFileNonDurable.actor.h @@ -269,7 +269,7 @@ public: filesBeingDeleted[filename] = deleteFuture; } else if (isSoleOwner()) { // isSoleOwner is a bit confusing here. What we mean is that the openFiles map is the sole owner. If we - // we remove the file from the map to make sure it gets closed. + // remove the file from the map to make sure it gets closed. g_simulator.getCurrentProcess()->machine->openFiles.erase(filename); } } From b5412b355e3f900f7b40adadf5d7b51ee142141a Mon Sep 17 00:00:00 2001 From: Nim Wijetunga Date: Wed, 24 Mar 2021 23:34:34 +0000 Subject: [PATCH 036/317] Add Java API for network busyness --- bindings/java/fdbJNI.cpp | 11 +++++++++++ .../src/main/com/apple/foundationdb/Database.java | 9 +++++++++ .../src/main/com/apple/foundationdb/FDBDatabase.java | 11 +++++++++++ 3 files changed, 31 insertions(+) diff --git a/bindings/java/fdbJNI.cpp b/bindings/java/fdbJNI.cpp index ebe83269e6..f5d66577fd 100644 --- a/bindings/java/fdbJNI.cpp +++ b/bindings/java/fdbJNI.cpp @@ -580,6 +580,17 @@ JNIEXPORT void JNICALL Java_com_apple_foundationdb_FDBDatabase_Database_1setOpti } } +JNIEXPORT jdouble JNICALL Java_com_apple_foundationdb_FDBDatabase_Database_1getMainThreadBusyness(JNIEnv* jenv, + jobject, + jlong dbPtr) { + if (!dbPtr) { + throwParamNotNull(jenv); + return 0; + } + FDBDatabase* database = (FDBDatabase*)dbPtr; + return (jdouble) fdb_database_get_main_thread_busyness(database); +} + JNIEXPORT jboolean JNICALL 
Java_com_apple_foundationdb_FDB_Error_1predicate(JNIEnv* jenv, jobject, jint predicate, diff --git a/bindings/java/src/main/com/apple/foundationdb/Database.java b/bindings/java/src/main/com/apple/foundationdb/Database.java index e5f2d36de6..741fa1c5eb 100644 --- a/bindings/java/src/main/com/apple/foundationdb/Database.java +++ b/bindings/java/src/main/com/apple/foundationdb/Database.java @@ -80,6 +80,15 @@ public interface Database extends AutoCloseable, TransactionContext { */ DatabaseOptions options(); + /** + * Returns a value which indicates the saturation of the client + *
+ * Note: By default, this value is updated every second + * + * @return a value where 0 indicates that the client is idle and 1 (or larger) indicates that the client is saturated. + */ + double getMainThreadBusyness(); + /** * Runs a read-only transactional function against this {@code Database} with retry logic. * {@link Function#apply(Object) apply(ReadTransaction)} will be called on the diff --git a/bindings/java/src/main/com/apple/foundationdb/FDBDatabase.java b/bindings/java/src/main/com/apple/foundationdb/FDBDatabase.java index 620b5aaa4e..8df1fd75b6 100644 --- a/bindings/java/src/main/com/apple/foundationdb/FDBDatabase.java +++ b/bindings/java/src/main/com/apple/foundationdb/FDBDatabase.java @@ -150,6 +150,16 @@ class FDBDatabase extends NativeObjectWrapper implements Database, OptionConsume } } + @Override + public double getMainThreadBusyness() { + pointerReadLock.lock(); + try { + return Database_getMainThreadBusyness(getPtr()); + } finally { + pointerReadLock.unlock(); + } + } + @Override public Executor getExecutor() { return executor; @@ -163,4 +173,5 @@ class FDBDatabase extends NativeObjectWrapper implements Database, OptionConsume private native long Database_createTransaction(long cPtr); private native void Database_dispose(long cPtr); private native void Database_setOption(long cPtr, int code, byte[] value) throws FDBException; + private native double Database_getMainThreadBusyness(long cPtr); } \ No newline at end of file From 21f1e1d5de98ab75264ccc30cecd35f682b9b647 Mon Sep 17 00:00:00 2001 From: Nim Wijetunga Date: Wed, 24 Mar 2021 23:38:42 +0000 Subject: [PATCH 037/317] add comment --- bindings/java/fdbJNI.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/bindings/java/fdbJNI.cpp b/bindings/java/fdbJNI.cpp index f5d66577fd..482098e815 100644 --- a/bindings/java/fdbJNI.cpp +++ b/bindings/java/fdbJNI.cpp @@ -580,6 +580,9 @@ JNIEXPORT void JNICALL Java_com_apple_foundationdb_FDBDatabase_Database_1setOpti } } +// Get network thread 
busyness (updated every 1s) +// A value of 0 indicates that the client is more or less idle +// A value of 1 (or more) indicates that the client is saturated JNIEXPORT jdouble JNICALL Java_com_apple_foundationdb_FDBDatabase_Database_1getMainThreadBusyness(JNIEnv* jenv, jobject, jlong dbPtr) { From a84592df7e4151e54a3a7717e58d24e2f701f410 Mon Sep 17 00:00:00 2001 From: Nim Wijetunga Date: Wed, 24 Mar 2021 23:59:40 +0000 Subject: [PATCH 038/317] add test for network busyness --- .../test/com/apple/foundationdb/test/AsyncStackTester.java | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/bindings/java/src/test/com/apple/foundationdb/test/AsyncStackTester.java b/bindings/java/src/test/com/apple/foundationdb/test/AsyncStackTester.java index f584f452a9..87ea5adfe0 100644 --- a/bindings/java/src/test/com/apple/foundationdb/test/AsyncStackTester.java +++ b/bindings/java/src/test/com/apple/foundationdb/test/AsyncStackTester.java @@ -510,6 +510,12 @@ public class AsyncStackTester { db.options().setTransactionCausalReadRisky(); db.options().setTransactionIncludePortInAddress(); + // Test network busyness + double busyness = db.getMainThreadBusyness(); + if (busyness < 0) { + throw new IllegalStateException("Network busyness cannot be less than 0"); + } + tr.options().setPrioritySystemImmediate(); tr.options().setPriorityBatch(); tr.options().setCausalReadRisky(); From bdccf8bc801504e846892b242a57ab829818b643 Mon Sep 17 00:00:00 2001 From: Nim Wijetunga Date: Thu, 25 Mar 2021 00:11:11 +0000 Subject: [PATCH 039/317] fix formatting issues --- bindings/java/fdbJNI.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bindings/java/fdbJNI.cpp b/bindings/java/fdbJNI.cpp index 482098e815..06acae658e 100644 --- a/bindings/java/fdbJNI.cpp +++ b/bindings/java/fdbJNI.cpp @@ -584,14 +584,14 @@ JNIEXPORT void JNICALL Java_com_apple_foundationdb_FDBDatabase_Database_1setOpti // A value of 0 indicates that the client is more or less idle // A value of 1 (or more) 
indicates that the client is saturated JNIEXPORT jdouble JNICALL Java_com_apple_foundationdb_FDBDatabase_Database_1getMainThreadBusyness(JNIEnv* jenv, - jobject, - jlong dbPtr) { + jobject, + jlong dbPtr) { if (!dbPtr) { throwParamNotNull(jenv); return 0; } FDBDatabase* database = (FDBDatabase*)dbPtr; - return (jdouble) fdb_database_get_main_thread_busyness(database); + return (jdouble)fdb_database_get_main_thread_busyness(database); } JNIEXPORT jboolean JNICALL Java_com_apple_foundationdb_FDB_Error_1predicate(JNIEnv* jenv, From b51e4aa59048ed73afbb6a6d82b4d86f520f6129 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Wed, 24 Mar 2021 19:57:24 -0600 Subject: [PATCH 040/317] handle file renames properly --- fdbrpc/AsyncFileNonDurable.actor.h | 12 +++++++++++- flow/flow.h | 2 ++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/fdbrpc/AsyncFileNonDurable.actor.h b/fdbrpc/AsyncFileNonDurable.actor.h index 8cc65bf4a5..21cfda8907 100644 --- a/fdbrpc/AsyncFileNonDurable.actor.h +++ b/fdbrpc/AsyncFileNonDurable.actor.h @@ -270,7 +270,17 @@ public: } else if (isSoleOwner()) { // isSoleOwner is a bit confusing here. What we mean is that the openFiles map is the sole owner. If we // remove the file from the map to make sure it gets closed. - g_simulator.getCurrentProcess()->machine->openFiles.erase(filename); + auto& openFiles = g_simulator.getCurrentProcess()->machine->openFiles; + auto iter = openFiles.find(filename); + // the file could've been renamed (DiskQueue does that for example). In that case the file won't be in the + // map anymore. + if (iter != openFiles.end()) { + // even if the filename exists, it doesn't mean that it references the same file. It could be that the + // file was renamed and later a file with the same name was opened. 
+ if (iter->second.canGet() && iter->second.get().getPtr() == this) { + openFiles.erase(filename); + } + } } } diff --git a/flow/flow.h b/flow/flow.h index 987572d7c5..e03d598d9b 100644 --- a/flow/flow.h +++ b/flow/flow.h @@ -674,6 +674,8 @@ public: bool isValid() const { return sav != 0; } bool isReady() const { return sav->isSet(); } bool isError() const { return sav->isError(); } + // returns true if get can be called on this future (counterpart of canBeSet on Promises) + bool canGet() const { return isValid() && isReady() && !isError(); } Error& getError() const { ASSERT(isError()); return sav->error_state; From 1385a776daa0b90cb20478251d0faf8766cb1a10 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Thu, 25 Mar 2021 13:22:29 -0600 Subject: [PATCH 041/317] only remove files from the open map if they have no modifications in flight --- fdbrpc/AsyncFileNonDurable.actor.h | 49 ++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/fdbrpc/AsyncFileNonDurable.actor.h b/fdbrpc/AsyncFileNonDurable.actor.h index 21cfda8907..281b3f289d 100644 --- a/fdbrpc/AsyncFileNonDurable.actor.h +++ b/fdbrpc/AsyncFileNonDurable.actor.h @@ -259,6 +259,37 @@ public: //TraceEvent("AsyncFileNonDurable_Destroy", id).detail("Filename", filename); } + // The purpose of this actor is to simply keep a reference to a non-durable file until all pending modifications + // have completed. When they return, this actor will die and therefore decrement the reference count by 1. 
+ ACTOR void waitOnOutstandingModifications(Reference self) { + state ISimulator::ProcessInfo* currentProcess = g_simulator.getCurrentProcess(); + state TaskPriority currentTaskID = g_network->getCurrentTask(); + state std::string filename = self->filename; + + wait(g_simulator.onMachine(currentProcess)); + try { + Promise startSyncPromise = self->startSyncPromise; + self->startSyncPromise = Promise(); + startSyncPromise.send(true); + + std::vector> outstandingModifications; + + for (auto itr = self->pendingModifications.ranges().begin(); + itr != self->pendingModifications.ranges().end(); + ++itr) + if (itr->value().isValid() && !itr->value().isReady()) + outstandingModifications.push_back(itr->value()); + + // Ignore errors here so that all modifications can finish + wait(waitForAllReady(outstandingModifications)); + wait(g_simulator.onProcess(currentProcess, currentTaskID)); + } catch (Error& e) { + state Error err = e; + wait(g_simulator.onProcess(currentProcess, currentTaskID)); + throw err; + } + } + void addref() override { ReferenceCounted::addref(); } void delref() override { if (delref_no_destroy()) { @@ -270,6 +301,24 @@ public: } else if (isSoleOwner()) { // isSoleOwner is a bit confusing here. What we mean is that the openFiles map is the sole owner. If we // remove the file from the map to make sure it gets closed. + bool hasPendingModifications = false; + for (auto iter = pendingModifications.ranges().begin(); iter != pendingModifications.ranges().end(); + ++iter) { + if (iter->value().isValid() && !iter->value().isReady()) { + hasPendingModifications = true; + break; + } + } + if (hasPendingModifications) { + // If we still have pending references we won't close the file and instead wait for them. But while we + // wait for those to complete, another actor might open the file. So we call into an actor that will + // hold a refernce until all pending operations are complete. If someone opens this file before this + // completes, nothing will happen. 
Otherwise we will enter delref again but this time + // hasPendingModifications will evalualte to false. + addref(); + waitOnOutstandingModifications(Reference(this)); + return; + } auto& openFiles = g_simulator.getCurrentProcess()->machine->openFiles; auto iter = openFiles.find(filename); // the file could've been renamed (DiskQueue does that for example). In that case the file won't be in the From 1033db9fba275a809b3159fc2d52a92293350a45 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Thu, 25 Mar 2021 14:00:07 -0600 Subject: [PATCH 042/317] Revert change --- fdbrpc/AsyncFileNonDurable.actor.h | 47 +++++++----------------------- 1 file changed, 11 insertions(+), 36 deletions(-) diff --git a/fdbrpc/AsyncFileNonDurable.actor.h b/fdbrpc/AsyncFileNonDurable.actor.h index 281b3f289d..f65895067e 100644 --- a/fdbrpc/AsyncFileNonDurable.actor.h +++ b/fdbrpc/AsyncFileNonDurable.actor.h @@ -267,27 +267,20 @@ public: state std::string filename = self->filename; wait(g_simulator.onMachine(currentProcess)); - try { - Promise startSyncPromise = self->startSyncPromise; - self->startSyncPromise = Promise(); - startSyncPromise.send(true); + Promise startSyncPromise = self->startSyncPromise; + self->startSyncPromise = Promise(); + startSyncPromise.send(true); - std::vector> outstandingModifications; + std::vector> outstandingModifications; - for (auto itr = self->pendingModifications.ranges().begin(); - itr != self->pendingModifications.ranges().end(); - ++itr) - if (itr->value().isValid() && !itr->value().isReady()) - outstandingModifications.push_back(itr->value()); + for (auto itr = self->pendingModifications.ranges().begin(); itr != self->pendingModifications.ranges().end(); + ++itr) + if (itr->value().isValid() && !itr->value().isReady()) + outstandingModifications.push_back(itr->value()); - // Ignore errors here so that all modifications can finish - wait(waitForAllReady(outstandingModifications)); - wait(g_simulator.onProcess(currentProcess, currentTaskID)); - } catch 
(Error& e) { - state Error err = e; - wait(g_simulator.onProcess(currentProcess, currentTaskID)); - throw err; - } + // Ignore errors here so that all modifications can finish + wait(waitForAllReady(outstandingModifications)); + wait(g_simulator.onProcess(currentProcess, currentTaskID)); } void addref() override { ReferenceCounted::addref(); } @@ -301,24 +294,6 @@ public: } else if (isSoleOwner()) { // isSoleOwner is a bit confusing here. What we mean is that the openFiles map is the sole owner. If we // remove the file from the map to make sure it gets closed. - bool hasPendingModifications = false; - for (auto iter = pendingModifications.ranges().begin(); iter != pendingModifications.ranges().end(); - ++iter) { - if (iter->value().isValid() && !iter->value().isReady()) { - hasPendingModifications = true; - break; - } - } - if (hasPendingModifications) { - // If we still have pending references we won't close the file and instead wait for them. But while we - // wait for those to complete, another actor might open the file. So we call into an actor that will - // hold a refernce until all pending operations are complete. If someone opens this file before this - // completes, nothing will happen. Otherwise we will enter delref again but this time - // hasPendingModifications will evalualte to false. - addref(); - waitOnOutstandingModifications(Reference(this)); - return; - } auto& openFiles = g_simulator.getCurrentProcess()->machine->openFiles; auto iter = openFiles.find(filename); // the file could've been renamed (DiskQueue does that for example). 
In that case the file won't be in the From c3ba4659ff461d3a5eb16eaa62d563627ea2032b Mon Sep 17 00:00:00 2001 From: Andrew Noyes Date: Fri, 26 Mar 2021 18:06:21 +0000 Subject: [PATCH 043/317] Document that ryw disable can only be set at beginning of transaction --- fdbclient/vexillographer/fdb.options | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbclient/vexillographer/fdb.options b/fdbclient/vexillographer/fdb.options index 82ba1910c2..db68bb31a4 100644 --- a/fdbclient/vexillographer/fdb.options +++ b/fdbclient/vexillographer/fdb.options @@ -210,7 +210,7 @@ description is not currently required but encouraged.