From 76838a20b7bd936472d3431bbc7534afac883dad Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Fri, 30 Oct 2020 09:11:08 -0700 Subject: [PATCH 001/180] A model used to quickly simulate various GRV scenarios and algorithms --- contrib/grv_proxy_model/grv_test.py | 134 ++++++++ contrib/grv_proxy_model/plot.py | 107 +++++++ contrib/grv_proxy_model/priority.py | 40 +++ contrib/grv_proxy_model/proxy_model.py | 338 ++++++++++++++++++++ contrib/grv_proxy_model/rate_model.py | 83 +++++ contrib/grv_proxy_model/ratekeeper_model.py | 67 ++++ contrib/grv_proxy_model/smoother.py | 53 +++ contrib/grv_proxy_model/workload_model.py | 201 ++++++++++++ 8 files changed, 1023 insertions(+) create mode 100755 contrib/grv_proxy_model/grv_test.py create mode 100755 contrib/grv_proxy_model/plot.py create mode 100755 contrib/grv_proxy_model/priority.py create mode 100755 contrib/grv_proxy_model/proxy_model.py create mode 100755 contrib/grv_proxy_model/rate_model.py create mode 100755 contrib/grv_proxy_model/ratekeeper_model.py create mode 100644 contrib/grv_proxy_model/smoother.py create mode 100755 contrib/grv_proxy_model/workload_model.py diff --git a/contrib/grv_proxy_model/grv_test.py b/contrib/grv_proxy_model/grv_test.py new file mode 100755 index 0000000000..1cd0224538 --- /dev/null +++ b/contrib/grv_proxy_model/grv_test.py @@ -0,0 +1,134 @@ +#!/usr/bin/env python3 + +# +# grv_test.py +# +# This source file is part of the FoundationDB open source project +# +# Copyright 2013-2020 Apple Inc. and the FoundationDB project authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +import argparse +import inspect +import sys + +import rate_model +import workload_model +import proxy_model +import ratekeeper_model +from priority import Priority +from plot import Plotter + +parser = argparse.ArgumentParser() +parser.add_argument('-w', '--workload', type=str, help='Name of workload to run') +parser.add_argument('-r', '--ratekeeper', type=str, help='Name of ratekeeper model') +parser.add_argument('-d', '--duration', type=int, default=240, help='Duration of simulated test, in seconds. Defaults to 240.') +parser.add_argument('-L', '--limiter', type=str, default='Original', help='Name of limiter implementation. Defaults to \'Original\'.') +parser.add_argument('-p', '--proxy', type=str, default='ProxyModel', help='Name of proxy implementation. Defaults to \'ProxyModel\'.') +parser.add_argument('--list', action='store_true', default=False, help='List options for all models.') +parser.add_argument('--no-graph', action='store_true', default=False, help='Disable graphical output.') + +args = parser.parse_args() + +def print_choices_list(context=None): + if context == 'workload' or context is None: + print('Workloads:') + for w in workload_model.predefined_workloads.keys(): + print(' %s' % w) + + if context == 'ratekeeper' or context is None: + print('\nRatekeeper models:') + for r in ratekeeper_model.predefined_ratekeeper.keys(): + print(' %s' % r) + + proxy_model_classes = [c for c in [getattr(proxy_model, a) for a in dir(proxy_model)] if inspect.isclass(c)] + + if context == 'proxy' or context is None: + print('\nProxy models:') + for p in proxy_model_classes: + if issubclass(p, proxy_model.ProxyModel): + print(' %s' % p.__name__) + + if context == 'limiter' or context is None: + print('\nProxy limiters:') + for p in proxy_model_classes: + if issubclass(p, proxy_model.Limiter) and p != proxy_model.Limiter: + name = p.__name__ + if 
name.endswith('Limiter'): + name = name[0:-len('Limiter')] + print(' %s' % name) + +if args.list: + print_choices_list() + sys.exit(0) + +if args.workload is None or args.ratekeeper is None: + print('ERROR: A workload (-w/--workload) and ratekeeper model (-r/--ratekeeper) must be specified.\n') + print_choices_list() + sys.exit(1) + +def validate_class_type(var, name, superclass): + cls = getattr(var, name, None) + return cls is not None and inspect.isclass(cls) and issubclass(cls, superclass) + +if not args.ratekeeper in ratekeeper_model.predefined_ratekeeper: + print('Invalid ratekeeper model `%s\'' % args.ratekeeper) + print_choices_list('ratekeeper') + sys.exit(1) + +if not args.workload in workload_model.predefined_workloads: + print('Invalid workload model `%s\'' % args.workload) + print_choices_list('workload') + sys.exit(1) + +if not validate_class_type(proxy_model, args.proxy, proxy_model.ProxyModel): + print('Invalid proxy model `%s\'' % args.proxy) + print_choices_list('proxy') + sys.exit(1) + +limiter_name = args.limiter +if not validate_class_type(proxy_model, limiter_name, proxy_model.Limiter): + limiter_name += 'Limiter' + if not validate_class_type(proxy_model, limiter_name, proxy_model.Limiter): + print('Invalid proxy limiter `%s\'' % args.limiter) + print_choices_list('limiter') + sys.exit(1) + +ratekeeper = ratekeeper_model.predefined_ratekeeper[args.ratekeeper] +workload = workload_model.predefined_workloads[args.workload] + +limiter = getattr(proxy_model, limiter_name) +proxy = getattr(proxy_model, args.proxy)(args.duration, ratekeeper, workload, limiter) + +proxy.run() + +for priority in workload.priorities(): + latencies = sorted([p for t in proxy.results.latencies[priority].values() for p in t]) + total_started = sum(proxy.results.started[priority].values()) + still_queued = sum([r.count for r in proxy.request_queue if r.priority == priority]) + + if len(latencies) > 0: + print('\n%s: %d requests in %d seconds (rate=%f). %d still queued.' 
% (priority, total_started, proxy.time, float(total_started)/proxy.time, still_queued)) + print(' Median latency: %f' % latencies[len(latencies)//2]) + print(' 90%% latency: %f' % latencies[int(0.9*len(latencies))]) + print(' 99%% latency: %f' % latencies[int(0.99*len(latencies))]) + print(' 99.9%% latency: %f' % latencies[int(0.999*len(latencies))]) + print(' Max latency: %f' % latencies[-1]) + +print('') + +if not args.no_graph: + plotter = Plotter(proxy.results) + plotter.display() diff --git a/contrib/grv_proxy_model/plot.py b/contrib/grv_proxy_model/plot.py new file mode 100755 index 0000000000..9334e2c844 --- /dev/null +++ b/contrib/grv_proxy_model/plot.py @@ -0,0 +1,107 @@ +# +# plot.py +# +# This source file is part of the FoundationDB open source project +# +# Copyright 2013-2020 Apple Inc. and the FoundationDB project authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import matplotlib.pyplot as plt + +class Plotter: + def __init__(self, results): + self.results = results + + def add_plot(data, time_resolution, label, use_avg=False): + out_data = {} + counts = {} + for t in data.keys(): + out_data.setdefault(t//time_resolution*time_resolution, 0) + counts.setdefault(t//time_resolution*time_resolution, 0) + out_data[t//time_resolution*time_resolution] += data[t] + counts[t//time_resolution*time_resolution] += 1 + + if use_avg: + out_data = { t: v/counts[t] for t,v in out_data.items() } + + plt.plot(list(out_data.keys()), list(out_data.values()), label=label) + + def add_plot_with_times(data, label): + plt.plot(list(data.keys()), list(data.values()), label=label) + + def display(self, time_resolution=0.1): + plt.figure(figsize=(40,9)) + plt.subplot(3, 3, 1) + for priority in self.results.started.keys(): + Plotter.add_plot(self.results.started[priority], time_resolution, priority) + + plt.xlabel('Time (s)') + plt.ylabel('Released/s') + plt.legend() + + plt.subplot(3, 3, 2) + for priority in self.results.queued.keys(): + Plotter.add_plot(self.results.queued[priority], time_resolution, priority) + + plt.xlabel('Time (s)') + plt.ylabel('Requests/s') + plt.legend() + + plt.subplot(3, 3, 3) + for priority in self.results.unprocessed_queue_sizes.keys(): + data = {k: max(v) for (k,v) in self.results.unprocessed_queue_sizes[priority].items()} + Plotter.add_plot(data, time_resolution, priority) + + plt.xlabel('Time (s)') + plt.ylabel('Max queue size') + plt.legend() + + num = 4 + for priority in self.results.latencies.keys(): + plt.subplot(3, 3, num) + median_latencies = {k: v[int(0.5*len(v))] if len(v) > 0 else 0 for (k,v) in self.results.latencies[priority].items()} + percentile90_latencies = {k: v[int(0.9*len(v))] if len(v) > 0 else 0 for (k,v) in self.results.latencies[priority].items()} + max_latencies = {k: max(v) if len(v) > 0 else 0 for (k,v) in self.results.latencies[priority].items()} + + Plotter.add_plot(median_latencies, 
time_resolution, 'median') + Plotter.add_plot(percentile90_latencies, time_resolution, '90th percentile') + Plotter.add_plot(max_latencies, time_resolution, 'max') + + plt.xlabel('Time (s)') + plt.ylabel(str(priority) + ' Latency (s)') + plt.yscale('log') + plt.legend() + num += 1 + + for priority in self.results.rate.keys(): + plt.subplot(3, 3, num) + if len(self.results.rate[priority]) > 0: + Plotter.add_plot(self.results.rate[priority], time_resolution, 'Rate', use_avg=True) + if len(self.results.released[priority]) > 0: + Plotter.add_plot(self.results.released[priority], time_resolution, 'Released', use_avg=True) + if len(self.results.limit[priority]) > 0: + Plotter.add_plot(self.results.limit[priority], time_resolution, 'Limit', use_avg=True) + if len(self.results.limit_and_budget[priority]) > 0: + Plotter.add_plot(self.results.limit_and_budget[priority], time_resolution, 'Limit and budget', use_avg=True) + if len(self.results.budget[priority]) > 0: + Plotter.add_plot(self.results.budget[priority], time_resolution, 'Budget', use_avg=True) + + plt.xlabel('Time (s)') + plt.ylabel('Value (' + str(priority) + ')') + plt.legend() + num += 1 + + plt.show() + diff --git a/contrib/grv_proxy_model/priority.py b/contrib/grv_proxy_model/priority.py new file mode 100755 index 0000000000..3ba5c05f2e --- /dev/null +++ b/contrib/grv_proxy_model/priority.py @@ -0,0 +1,40 @@ +# +# priority.py +# +# This source file is part of the FoundationDB open source project +# +# Copyright 2013-2020 Apple Inc. and the FoundationDB project authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +import functools + +@functools.total_ordering +class Priority: + def __init__(self, priority_value, label): + self.priority_value = priority_value + self.label = label + + def __lt__(self, other): + return self.priority_value < other.priority_value + + def __str__(self): + return self.label + + def __repr__(self): + return repr(self.label) + +Priority.SYSTEM = Priority(0, "System") +Priority.DEFAULT = Priority(1, "Default") +Priority.BATCH = Priority(2, "Batch") diff --git a/contrib/grv_proxy_model/proxy_model.py b/contrib/grv_proxy_model/proxy_model.py new file mode 100755 index 0000000000..9ca2a39bfe --- /dev/null +++ b/contrib/grv_proxy_model/proxy_model.py @@ -0,0 +1,338 @@ +# +# proxy_model.py +# +# This source file is part of the FoundationDB open source project +# +# Copyright 2013-2020 Apple Inc. and the FoundationDB project authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import copy +import functools +import heapq + +from priority import Priority +from smoother import Smoother + +@functools.total_ordering +class Task: + def __init__(self, time, fxn): + self.time = time + self.fxn = fxn + + def __lt__(self, other): + return self.time < other.time + +class Limiter: + class UpdateRateParams: + def __init__(self, time): + self.time = time + + class UpdateLimitParams: + def __init__(self, time, elapsed): + self.time = time + self.elapsed = elapsed + + class CanStartParams: + def __init__(self, time, num_started, count): + self.time = time + self.num_started = num_started + self.count = count + + class UpdateBudgetParams: + def __init__(self, time, num_started, num_started_at_priority, min_priority, last_batch, queue_empty, elapsed): + self.time = time + self.num_started = num_started + self.num_started_at_priority = num_started_at_priority + self.min_priority = min_priority + self.last_batch = last_batch + self.queue_empty = queue_empty + self.elapsed = elapsed + + def __init__(self, priority, ratekeeper_model, proxy_model): + self.priority = priority + self.ratekeeper_model = ratekeeper_model + self.proxy_model = proxy_model + self.limit = 0 + self.rate = self.ratekeeper_model.get_limit(0, self.priority) + + def update_rate(self, params): + pass + + def update_limit(self, params): + pass + + def can_start(self, params): + pass + + def update_budget(self, params): + pass + +class OriginalLimiter(Limiter): + def __init__(self, priority, limit_rate_model, proxy_model): + Limiter.__init__(self, priority, limit_rate_model, proxy_model) + + def update_rate(self, params): + self.rate = self.ratekeeper_model.get_limit(params.time, self.priority) + + def update_limit(self, params): + self.limit = min(0, self.limit) + params.elapsed * self.rate + self.limit = min(self.limit, self.rate * 0.01) + self.limit = min(self.limit, 100000) + + self.proxy_model.results.rate[self.priority][params.time] = self.rate + 
self.proxy_model.results.limit[self.priority][params.time] = self.limit + + def can_start(self, params): + return params.num_started < self.limit + + def update_budget(self, params): + self.limit -= params.num_started + +class PositiveBudgetLimiter(OriginalLimiter): + def __init__(self, priority, limit_rate_model, proxy_model): + OriginalLimiter.__init__(self, priority, limit_rate_model, proxy_model) + + def update_limit(self, params): + self.limit += params.elapsed * self.rate + self.limit = min(self.limit, 2.0 * self.rate) + +class ClampedBudgetLimiter(PositiveBudgetLimiter): + def __init__(self, priority, limit_rate_model, proxy_model): + PositiveBudgetLimiter.__init__(self, priority, limit_rate_model, proxy_model) + + def update_budget(self, params): + min_budget = -self.rate * 5.0 + if self.limit > min_budget: + self.limit = max(self.limit - params.num_started, min_budget) + +class TimeLimiter(PositiveBudgetLimiter): + def __init__(self, priority, limit_rate_model, proxy_model): + PositiveBudgetLimiter.__init__(self, priority, limit_rate_model, proxy_model) + self.locked_until = 0 + + def can_start(self, params): + return params.time >= self.locked_until and PositiveBudgetLimiter.can_start(self, params) + + def update_budget(self, params): + #print('Start update budget: time=%f, limit=%f, locked_until=%f, num_started=%d, priority=%s, min_priority=%s, last_batch=%d' % (params.time, self.limit, self.locked_until, params.num_started, self.priority, params.min_priority, params.last_batch)) + + if params.min_priority >= self.priority or params.num_started < self.limit: + self.limit -= params.num_started + else: + self.limit = min(self.limit, max(self.limit - params.num_started, -params.last_batch)) + self.locked_until = min(params.time + 2.0, max(params.time, self.locked_until) + (params.num_started - self.limit)/self.rate) + + #print('End update budget: time=%f, limit=%f, locked_until=%f, num_started=%d, priority=%s, min_priority=%s' % (params.time, self.limit, 
self.locked_until, params.num_started, self.priority, params.min_priority)) + +class TimePositiveBudgetLimiter(PositiveBudgetLimiter): + def __init__(self, priority, limit_rate_model, proxy_model): + PositiveBudgetLimiter.__init__(self, priority, limit_rate_model, proxy_model) + self.locked_until = 0 + + def update_limit(self, params): + if params.time >= self.locked_until: + PositiveBudgetLimiter.update_limit(self, params) + + def can_start(self, params): + return params.num_started + params.count <= self.limit + + def update_budget(self, params): + #if params.num_started > 0: + #print('Start update budget: time=%f, limit=%f, locked_until=%f, num_started=%d, priority=%s, min_priority=%s, last_batch=%d' % (params.time, self.limit, self.locked_until, params.num_started, self.priority, params.min_priority, params.last_batch)) + + if params.num_started > self.limit: + self.locked_until = min(params.time + 2.0, max(params.time, self.locked_until) + (params.num_started - self.limit)/self.rate) + self.limit = 0 + else: + self.limit -= params.num_started + + #if params.num_started > 0: + #print('End update budget: time=%f, limit=%f, locked_until=%f, num_started=%d, priority=%s, min_priority=%s' % (params.time, self.limit, self.locked_until, params.num_started, self.priority, params.min_priority)) + +class SmoothingLimiter(OriginalLimiter): + def __init__(self, priority, limit_rate_model, proxy_model): + OriginalLimiter.__init__(self, priority, limit_rate_model, proxy_model) + self.smooth_released = Smoother(2) + self.smooth_rate_limit = Smoother(2) + self.rate_set = False + + def update_rate(self, params): + OriginalLimiter.update_rate(self, params) + if not self.rate_set: + self.rate_set = True + self.smooth_rate_limit.reset(self.rate) + else: + self.smooth_rate_limit.set_total(params.time, self.rate) + + def update_limit(self, params): + self.limit = 2.0 * (self.smooth_rate_limit.smooth_total(params.time) - self.smooth_released.smooth_rate(params.time)) + + def can_start(self, params): + return 
params.num_started + params.count <= self.limit + + def update_budget(self, params): + self.smooth_released.add_delta(params.time, params.num_started) + +class SmoothingBudgetLimiter(SmoothingLimiter): + def __init__(self, priority, limit_rate_model, proxy_model): + SmoothingLimiter.__init__(self, priority, limit_rate_model, proxy_model) + #self.smooth_filled = Smoother(2) + self.budget = 0 + + def update_limit(self, params): + release_rate = (self.smooth_rate_limit.smooth_total(params.time) - self.smooth_released.smooth_rate(params.time)) + #self.smooth_filled.set_total(params.time, 1 if release_rate > 0 else 0) + self.limit = 2.0 * release_rate + + self.proxy_model.results.rate[self.priority][params.time] = self.smooth_rate_limit.smooth_total(params.time) + self.proxy_model.results.released[self.priority][params.time] = self.smooth_released.smooth_rate(params.time) + self.proxy_model.results.limit[self.priority][params.time] = self.limit + self.proxy_model.results.limit_and_budget[self.priority][params.time] = self.limit + self.budget + self.proxy_model.results.budget[self.priority][params.time] = self.budget + + #self.budget = max(0, self.budget + params.elapsed * self.smooth_rate_limit.smooth_total(params.time)) + + #if self.smooth_filled.smooth_total(params.time) >= 0.1: + #self.budget += params.elapsed * self.smooth_rate_limit.smooth_total(params.time) + + #print('Update limit: time=%f, priority=%s, limit=%f, rate=%f, released=%f, budget=%f' % (params.time, self.priority, self.limit, self.smooth_rate_limit.smooth_total(params.time), self.smooth_released.smooth_rate(params.time), self.budget)) + + def can_start(self, params): + return params.num_started + params.count <= self.limit + self.budget #or params.num_started + params.count <= self.budget + + def update_budget(self, params): + self.budget = max(0, self.budget + (self.limit - params.num_started_at_priority) / 2 * params.elapsed) + + if params.queue_empty: + self.budget = min(10, self.budget) + + 
self.smooth_released.add_delta(params.time, params.num_started_at_priority) + +class ProxyModel: + class Results: + def __init__(self, priorities, duration): + self.started = self.init_result(priorities, 0, duration) + self.queued = self.init_result(priorities, 0, duration) + self.latencies = self.init_result(priorities, [], duration) + self.unprocessed_queue_sizes = self.init_result(priorities, [], duration) + + self.rate = {p:{} for p in priorities} + self.released = {p:{} for p in priorities} + self.limit = {p:{} for p in priorities} + self.limit_and_budget = {p:{} for p in priorities} + self.budget = {p:{} for p in priorities} + + def init_result(self, priorities, starting_value, duration): + return {p: {s: copy.copy(starting_value) for s in range(0, duration)} for p in priorities} + + def __init__(self, duration, ratekeeper_model, workload_model, Limiter): + self.time = 0 + self.log_time = 0 + self.duration = duration + self.priority_limiters = { priority: Limiter(priority, ratekeeper_model, self) for priority in workload_model.priorities() } + self.workload_model = workload_model + self.request_scheduled = { p: False for p in self.workload_model.priorities()} + + self.tasks = [] + self.request_queue = [] + self.results = ProxyModel.Results(self.workload_model.priorities(), duration) + + def run(self): + self.update_rate() + self.process_requests(self.time) + + for priority in self.workload_model.priorities(): + next_request = self.workload_model.next_request(self.time, priority) + assert next_request is not None + heapq.heappush(self.tasks, Task(next_request.time, lambda next_request=next_request: self.receive_request(next_request))) + self.request_scheduled[priority] = True + + while True:# or len(self.request_queue) > 0: + if int(self.time) > self.log_time: + self.log_time = int(self.time) + #print(self.log_time) + + task = heapq.heappop(self.tasks) + self.time = task.time + if self.time >= self.duration: + break + + task.fxn() + + def update_rate(self): + 
for limiter in self.priority_limiters.values(): + limiter.update_rate(Limiter.UpdateRateParams(self.time)) + + heapq.heappush(self.tasks, Task(self.time + 0.01, lambda: self.update_rate())) + + def receive_request(self, request): + heapq.heappush(self.request_queue, request) + + self.results.queued[request.priority][int(self.time)] += request.count + + next_request = self.workload_model.next_request(self.time, request.priority) + if next_request is not None and next_request.time < self.duration: + heapq.heappush(self.tasks, Task(next_request.time, lambda: self.receive_request(next_request))) + else: + self.request_scheduled[request.priority] = False + + def process_requests(self, last_time): + elapsed = self.time - last_time + for limiter in self.priority_limiters.values(): + limiter.update_limit(Limiter.UpdateLimitParams(self.time, elapsed)) + + current_started = 0 + started = {p:0 for p in self.workload_model.priorities()} + + min_priority = Priority.SYSTEM + last_batch = 0 + while len(self.request_queue) > 0: + request = self.request_queue[0] + + if not self.priority_limiters[request.priority].can_start(Limiter.CanStartParams(self.time, current_started, request.count)): + break + + min_priority = request.priority + last_batch = request.count + + if self.workload_model.request_completed(request) and not self.request_scheduled[request.priority]: + next_request = self.workload_model.next_request(self.time, request.priority) + assert next_request is not None + heapq.heappush(self.tasks, Task(next_request.time, lambda next_request=next_request: self.receive_request(next_request))) + self.request_scheduled[request.priority] = True + + current_started += request.count + started[request.priority] += request.count + + heapq.heappop(self.request_queue) + self.results.started[request.priority][int(self.time)] += request.count + self.results.latencies[request.priority][int(self.time)].append(self.time-request.time) + + if len(self.request_queue) == 0: + min_priority = 
Priority.BATCH + + for priority, limiter in self.priority_limiters.items(): + started_at_priority = sum([v for p,v in started.items() if p <= priority]) + limiter.update_budget(Limiter.UpdateBudgetParams(self.time, current_started, started_at_priority, min_priority, last_batch, len(self.request_queue) == 0 or self.request_queue[0].priority > priority, elapsed)) + + for priority in self.workload_model.priorities(): + self.results.unprocessed_queue_sizes[priority][int(self.time)].append(self.workload_model.workload_models[priority].outstanding) + + current_time = self.time + + delay = 0.001 + heapq.heappush(self.tasks, Task(self.time + delay, lambda: self.process_requests(current_time))) + + diff --git a/contrib/grv_proxy_model/rate_model.py b/contrib/grv_proxy_model/rate_model.py new file mode 100755 index 0000000000..1fabce2c7e --- /dev/null +++ b/contrib/grv_proxy_model/rate_model.py @@ -0,0 +1,83 @@ +# +# rate_model.py +# +# This source file is part of the FoundationDB open source project +# +# Copyright 2013-2020 Apple Inc. and the FoundationDB project authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import numpy + +class RateModel: + def __init__(self): + pass + + def get_rate(self, time): + pass + +class FixedRateModel(RateModel): + def __init__(self, rate): + RateModel.__init__(self) + self.rate = rate + + def get_rate(self, time): + return self.rate + +class UnlimitedRateModel(FixedRateModel): + def __init__(self): + self.rate = 1e9 + +class IntervalRateModel(RateModel): + def __init__(self, intervals): + self.intervals = sorted(intervals) + + def get_rate(self, time): + if len(self.intervals) == 0 or time < self.intervals[0][0]: + return 0 + + target_interval = len(self.intervals)-1 + for i in range(1, len(self.intervals)): + if time < self.intervals[i][0]: + target_interval = i-1 + break + + self.intervals = self.intervals[target_interval:] + return self.intervals[0][1] + +class SawtoothRateModel(RateModel): + def __init__(self, low, high, frequency): + self.low = low + self.high = high + self.frequency = frequency + + def get_rate(self, time): + if int(2*time/self.frequency) % 2 == 0: + return self.low + else: + return self.high + +class DistributionRateModel(RateModel): + def __init__(self, distribution, frequency): + self.distribution = distribution + self.frequency = frequency + self.last_change = 0 + self.rate = None + + def get_rate(self, time): + if self.frequency == 0 or int((time - self.last_change) / self.frequency) > int(self.last_change / self.frequency) or self.rate is None: + self.last_change = time + self.rate = self.distribution() + + return self.rate diff --git a/contrib/grv_proxy_model/ratekeeper_model.py b/contrib/grv_proxy_model/ratekeeper_model.py new file mode 100755 index 0000000000..57125dc4c0 --- /dev/null +++ b/contrib/grv_proxy_model/ratekeeper_model.py @@ -0,0 +1,67 @@ +# +# ratekeeper.py +# +# This source file is part of the FoundationDB open source project +# +# Copyright 2013-2020 Apple Inc. 
and the FoundationDB project authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import numpy +import rate_model +from priority import Priority + +class RatekeeperModel: + def __init__(self, limit_models): + self.limit_models = limit_models + + def get_limit(self, time, priority): + return self.limit_models[priority].get_rate(time) + +predefined_ratekeeper = {} + +predefined_ratekeeper['default200_batch100'] = RatekeeperModel( +{ + Priority.SYSTEM: rate_model.UnlimitedRateModel(), + Priority.DEFAULT: rate_model.FixedRateModel(200), + Priority.BATCH: rate_model.FixedRateModel(100) +}) + +predefined_ratekeeper['default_sawtooth'] = RatekeeperModel( +{ + Priority.SYSTEM: rate_model.UnlimitedRateModel(), + Priority.DEFAULT: rate_model.SawtoothRateModel(10, 200, 1), + Priority.BATCH: rate_model.FixedRateModel(0) +}) + +predefined_ratekeeper['default_uniform_random'] = RatekeeperModel( +{ + Priority.SYSTEM: rate_model.UnlimitedRateModel(), + Priority.DEFAULT: rate_model.DistributionRateModel(lambda: numpy.random.uniform(10, 200), 1), + Priority.BATCH: rate_model.FixedRateModel(0) +}) + +predefined_ratekeeper['default_trickle'] = RatekeeperModel( +{ + Priority.SYSTEM: rate_model.UnlimitedRateModel(), + Priority.DEFAULT: rate_model.FixedRateModel(3), + Priority.BATCH: rate_model.FixedRateModel(0) +}) + +predefined_ratekeeper['default1000'] = RatekeeperModel( +{ + Priority.SYSTEM: rate_model.UnlimitedRateModel(), + Priority.DEFAULT: rate_model.FixedRateModel(1000), + 
Priority.BATCH: rate_model.FixedRateModel(500) +}) diff --git a/contrib/grv_proxy_model/smoother.py b/contrib/grv_proxy_model/smoother.py new file mode 100644 index 0000000000..bc1b32ea12 --- /dev/null +++ b/contrib/grv_proxy_model/smoother.py @@ -0,0 +1,53 @@ +# +# smoother.py +# +# This source file is part of the FoundationDB open source project +# +# Copyright 2013-2020 Apple Inc. and the FoundationDB project authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import math + +class Smoother: + def __init__(self, folding_time): + self.folding_time = folding_time + self.reset(0) + + def reset(self, value): + self.time = 0 + self.total = value + self.estimate = value + + def set_total(self, time, total): + self.add_delta(time, total-self.total) + + def add_delta(self, time, delta): + self.update(time) + self.total += delta + + def smooth_total(self, time): + self.update(time) + return self.estimate + + def smooth_rate(self, time): + self.update(time) + return (self.total-self.estimate) / self.folding_time + + def update(self, time): + elapsed = time - self.time + if elapsed > 0: + self.time = time + self.estimate += (self.total-self.estimate) * (1-math.exp(-elapsed/self.folding_time)) + diff --git a/contrib/grv_proxy_model/workload_model.py b/contrib/grv_proxy_model/workload_model.py new file mode 100755 index 0000000000..63fb4c472e --- /dev/null +++ b/contrib/grv_proxy_model/workload_model.py @@ -0,0 +1,201 @@ +# +# workload_model.py +# +# This source file is 
part of the FoundationDB open source project +# +# Copyright 2013-2020 Apple Inc. and the FoundationDB project authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import functools +import numpy +import math + +import rate_model +from priority import Priority + +@functools.total_ordering +class Request: + def __init__(self, time, count, priority): + self.time = time + self.count = count + self.priority = priority + + def __lt__(self, other): + return self.priority < other.priority + +class PriorityWorkloadModel: + def __init__(self, priority, rate_model, batch_model, generator, max_outstanding=1e9): + self.priority = priority + self.rate_model = rate_model + self.batch_model = batch_model + self.generator = generator + self.max_outstanding = max_outstanding + self.outstanding = 0 + + def next_request(self, time): + if self.outstanding >= self.max_outstanding: + return None + + batch_size = self.batch_model.next_batch() + self.outstanding += batch_size + interval = self.generator.next_request_interval(self.rate_model.get_rate(time)) + return Request(time + interval, batch_size, self.priority) + + def request_completed(self, request): + was_full = self.max_outstanding <= self.outstanding + self.outstanding -= request.count + + return was_full and self.outstanding < self.max_outstanding + +class WorkloadModel: + def __init__(self, workload_models): + self.workload_models = workload_models + + def priorities(self): + return list(self.workload_models.keys()) + + 
def next_request(self, time, priority): + return self.workload_models[priority].next_request(time) + + def request_completed(self, request): + return self.workload_models[request.priority].request_completed(request) + +class Distribution: + EXPONENTIAL = lambda x: numpy.random.exponential(x) + UNIFORM = lambda x: numpy.random.uniform(0, 2.0*x) + FIXED = lambda x: x + +class BatchGenerator: + def __init__(self): + pass + + def next_batch(self): + pass + +class DistributionBatchGenerator(BatchGenerator): + def __init__(self, distribution, size): + BatchGenerator.__init__(self) + self.distribution = distribution + self.size = size + + def next_batch(self): + return math.ceil(self.distribution(self.size)) + +class RequestGenerator: + def __init__(self): + pass + + def next_request_interval(self, rate): + pass + +class DistributionRequestGenerator(RequestGenerator): + def __init__(self, distribution): + RequestGenerator.__init__(self) + self.distribution = distribution + + def next_request_interval(self, rate): + if rate == 0: + return 1e9 + + return self.distribution(1.0/rate) + +predefined_workloads = {} + +predefined_workloads['slow_exponential'] = WorkloadModel( +{ + Priority.DEFAULT: PriorityWorkloadModel(Priority.DEFAULT, + rate_model.FixedRateModel(100), + DistributionBatchGenerator(Distribution.FIXED, 1), + DistributionRequestGenerator(Distribution.EXPONENTIAL), + max_outstanding=100 + ) +}) + +predefined_workloads['fixed_uniform'] = WorkloadModel( +{ + Priority.SYSTEM: PriorityWorkloadModel(Priority.SYSTEM, + rate_model.FixedRateModel(0), + DistributionBatchGenerator(Distribution.FIXED, 1), + DistributionRequestGenerator(Distribution.UNIFORM), + max_outstanding=10 + ), + Priority.DEFAULT: PriorityWorkloadModel(Priority.DEFAULT, + rate_model.FixedRateModel(95), + DistributionBatchGenerator(Distribution.FIXED, 10), + DistributionRequestGenerator(Distribution.UNIFORM), + max_outstanding=200 + ), + Priority.BATCH: PriorityWorkloadModel(Priority.BATCH, + 
rate_model.FixedRateModel(1), + DistributionBatchGenerator(Distribution.UNIFORM, 500), + DistributionRequestGenerator(Distribution.UNIFORM), + max_outstanding=200 + ) +}) + +predefined_workloads['batch_starvation'] = WorkloadModel( +{ + Priority.SYSTEM: PriorityWorkloadModel(Priority.SYSTEM, + rate_model.FixedRateModel(1), + DistributionBatchGenerator(Distribution.FIXED, 1), + DistributionRequestGenerator(Distribution.UNIFORM), + max_outstanding=10 + ), + Priority.DEFAULT: PriorityWorkloadModel(Priority.DEFAULT, + rate_model.IntervalRateModel([(0,50), (60,150), (120,90)]), + DistributionBatchGenerator(Distribution.FIXED, 1), + DistributionRequestGenerator(Distribution.UNIFORM), + max_outstanding=200 + ), + Priority.BATCH: PriorityWorkloadModel(Priority.BATCH, + rate_model.FixedRateModel(100), + DistributionBatchGenerator(Distribution.FIXED, 1), + DistributionRequestGenerator(Distribution.UNIFORM), + max_outstanding=200 + ) +}) + +predefined_workloads['default_low_high_low'] = WorkloadModel( +{ + Priority.SYSTEM: PriorityWorkloadModel(Priority.SYSTEM, + rate_model.FixedRateModel(0), + DistributionBatchGenerator(Distribution.FIXED, 1), + DistributionRequestGenerator(Distribution.UNIFORM), + max_outstanding=10 + ), + Priority.DEFAULT: PriorityWorkloadModel(Priority.DEFAULT, + rate_model.IntervalRateModel([(0,100), (60,300), (120,100)]), + DistributionBatchGenerator(Distribution.FIXED, 1), + DistributionRequestGenerator(Distribution.UNIFORM), + max_outstanding=200 + ), + Priority.BATCH: PriorityWorkloadModel(Priority.BATCH, + rate_model.FixedRateModel(0), + DistributionBatchGenerator(Distribution.FIXED, 1), + DistributionRequestGenerator(Distribution.UNIFORM), + max_outstanding=200 + ) +}) + +for rate in [83, 100, 180, 190, 200]: + predefined_workloads['default%d' % rate] = WorkloadModel( + { + Priority.DEFAULT: PriorityWorkloadModel(Priority.DEFAULT, + rate_model.FixedRateModel(rate), + DistributionBatchGenerator(Distribution.FIXED, 1), + 
DistributionRequestGenerator(Distribution.EXPONENTIAL), + max_outstanding=1000 + ) + }) From 82f7f541c39377ae2386cc52b777b354b3f545c4 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Wed, 25 Nov 2020 11:38:08 -0700 Subject: [PATCH 002/180] started lineage implementation --- flow/flow.cpp | 2 ++ flow/flow.h | 33 +++++++++++++++++++++++++++++++-- 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/flow/flow.cpp b/flow/flow.cpp index 89f04bd5df..a2bfcc1510 100644 --- a/flow/flow.cpp +++ b/flow/flow.cpp @@ -26,6 +26,8 @@ #include #include +thread_local ActorLineagePropertyMap* currentLineage = nullptr; + #if (defined(__linux__) || defined(__FreeBSD__)) && defined(__AVX__) && !defined(MEMORY_SANITIZER) // For benchmarking; need a version of rte_memcpy that doesn't live in the same compilation unit as the test. void * rte_memcpy_noinline(void *__restrict __dest, const void *__restrict __src, size_t __n) { diff --git a/flow/flow.h b/flow/flow.h index a72465143d..155c5db2a2 100644 --- a/flow/flow.h +++ b/flow/flow.h @@ -36,6 +36,7 @@ #include #include #include +#include #include "flow/Platform.h" #include "flow/FastAlloc.h" @@ -407,6 +408,30 @@ struct SingleCallback { } }; +// in the future we might want to read these from a different thread. std::shared_ptr +// seems to be better suited for this... +struct ActorLineagePropertyMap : std::enable_shared_from_this { + std::shared_ptr parent = nullptr; +}; + +extern thread_local ActorLineagePropertyMap* currentLineage; + +struct ActorLineage { + std::shared_ptr properties = std::make_shared(); + ActorLineage() { + if (currentLineage) { + properties->parent = currentLineage->shared_from_this(); + } + } +}; + +struct save_lineage { + ActorLineagePropertyMap* current = currentLineage; + ~save_lineage() { + currentLineage = current; + } +}; + // SAV is short for Single Assignment Variable: It can be assigned for only once! 
template struct SAV : private Callback, FastAllocated> { @@ -445,6 +470,7 @@ public: ASSERT(canBeSet()); new (&value_storage) T(std::forward(value)); this->error_state = Error::fromCode(SET_ERROR_CODE); + save_lineage _{}; while (Callback::next != this) Callback::next->fire(this->value()); } @@ -457,6 +483,7 @@ public: void sendError(Error err) { ASSERT(canBeSet() && int16_t(err.code()) > 0); this->error_state = err; + save_lineage _{}; while (Callback::next != this) Callback::next->error(err); } @@ -477,6 +504,7 @@ public: void finishSendAndDelPromiseRef() { // Call only after value_storage has already been initialized! this->error_state = Error::fromCode(SET_ERROR_CODE); + save_lineage _{}; while (Callback::next != this) Callback::next->fire(this->value()); @@ -500,6 +528,7 @@ public: } this->error_state = err; + save_lineage _{}; while (Callback::next != this) Callback::next->error(err); @@ -987,7 +1016,7 @@ static inline void destruct(T& t) { } template -struct Actor : SAV { +struct Actor : SAV, ActorLineage { int8_t actor_wait_state; // -1 means actor is cancelled; 0 means actor is not waiting; 1-N mean waiting in callback group # Actor() : SAV(1, 1), actor_wait_state(0) { /*++actorCount;*/ } @@ -995,7 +1024,7 @@ struct Actor : SAV { }; template <> -struct Actor { +struct Actor : ActorLineage { // This specialization is for a void actor (one not returning a future, hence also uncancellable) int8_t actor_wait_state; // 0 means actor is not waiting; 1-N mean waiting in callback group # From 05f77f905fb3a32c026729479de3de5456a5789e Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Mon, 7 Dec 2020 15:15:25 -0700 Subject: [PATCH 003/180] Added actor lineage --- flow/actorcompiler/ActorCompiler.cs | 1 + flow/actorcompiler/actorcompiler.csproj | 108 +----------------------- flow/actorcompiler/actorcompiler.sln | 34 ++++++++ flow/flow.cpp | 5 +- flow/flow.h | 96 +++++++++++++-------- flow/genericactors.actor.h | 4 + 6 files changed, 110 insertions(+), 138 
deletions(-) create mode 100644 flow/actorcompiler/actorcompiler.sln diff --git a/flow/actorcompiler/ActorCompiler.cs b/flow/actorcompiler/ActorCompiler.cs index 7aef82a42e..dc9de91868 100644 --- a/flow/actorcompiler/ActorCompiler.cs +++ b/flow/actorcompiler/ActorCompiler.cs @@ -452,6 +452,7 @@ namespace actorcompiler fullClassName, string.Join(", ", actor.parameters.Select(p => p.name).ToArray())); + writer.WriteLine("restore_lineage _;"); if (actor.returnType != null) writer.WriteLine("\treturn Future<{1}>({0});", newActor, actor.returnType); else diff --git a/flow/actorcompiler/actorcompiler.csproj b/flow/actorcompiler/actorcompiler.csproj index e737adabd2..b590913634 100644 --- a/flow/actorcompiler/actorcompiler.csproj +++ b/flow/actorcompiler/actorcompiler.csproj @@ -1,108 +1,8 @@ - - + + - Debug - 10.0.20506 - 2.0 - {0ECC1314-3FC2-458D-8E41-B50B4EA24E51} Exe - Properties - actorcompiler - actorcompiler - v4.0 - 512 - $(SolutionDir)bin\$(Configuration)\ - publish\ - true - Disk - false - Foreground - 7 - Days - false - false - true - 0 - 1.0.0.%2a - false - false - true + net5.0 - - true - DEBUG;TRACE - full - AnyCPU - default - prompt - false - false - - - TRACE - true - pdbonly - AnyCPU - default - prompt - false - false - - - - - 3.5 - - - 3.5 - - - 3.5 - - - 4.0 - - - - - - - - - - - - - - False - Microsoft .NET Framework 4 %28x86 and x64%29 - true - - - False - .NET Framework 3.5 SP1 Client Profile - false - - - False - .NET Framework 3.5 SP1 - false - - - False - Windows Installer 3.1 - true - - - - - - - + \ No newline at end of file diff --git a/flow/actorcompiler/actorcompiler.sln b/flow/actorcompiler/actorcompiler.sln new file mode 100644 index 0000000000..a4292bfaaa --- /dev/null +++ b/flow/actorcompiler/actorcompiler.sln @@ -0,0 +1,34 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 15 +VisualStudioVersion = 15.0.26124.0 +MinimumVisualStudioVersion = 15.0.26124.0 +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = 
"actorcompiler", "actorcompiler.csproj", "{0ECC1314-3FC2-458D-8E41-B50B4EA24E51}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Any CPU = Debug|Any CPU + Debug|x64 = Debug|x64 + Debug|x86 = Debug|x86 + Release|Any CPU = Release|Any CPU + Release|x64 = Release|x64 + Release|x86 = Release|x86 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Debug|Any CPU.Build.0 = Debug|Any CPU + {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Debug|x64.ActiveCfg = Debug|Any CPU + {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Debug|x64.Build.0 = Debug|Any CPU + {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Debug|x86.ActiveCfg = Debug|Any CPU + {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Debug|x86.Build.0 = Debug|Any CPU + {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Release|Any CPU.ActiveCfg = Debug|Any CPU + {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Release|Any CPU.Build.0 = Debug|Any CPU + {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Release|x64.ActiveCfg = Debug|Any CPU + {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Release|x64.Build.0 = Debug|Any CPU + {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Release|x86.ActiveCfg = Debug|Any CPU + {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Release|x86.Build.0 = Debug|Any CPU + EndGlobalSection +EndGlobal diff --git a/flow/flow.cpp b/flow/flow.cpp index a2bfcc1510..c4a6097300 100644 --- a/flow/flow.cpp +++ b/flow/flow.cpp @@ -26,7 +26,10 @@ #include #include -thread_local ActorLineagePropertyMap* currentLineage = nullptr; +extern thread_local Reference currentLineage; + +ActorLineage::ActorLineage() : parent(currentLineage) { +} #if (defined(__linux__) || defined(__FreeBSD__)) && defined(__AVX__) && !defined(MEMORY_SANITIZER) // For benchmarking; need a version of rte_memcpy that doesn't live in the same 
compilation unit as the test. diff --git a/flow/flow.h b/flow/flow.h index 155c5db2a2..a0c9793a7a 100644 --- a/flow/flow.h +++ b/flow/flow.h @@ -20,6 +20,7 @@ #ifndef FLOW_FLOW_H #define FLOW_FLOW_H +#include "flow/FastRef.h" #pragma once #pragma warning( disable: 4244 4267 ) // SOMEDAY: Carefully check for integer overflow issues (e.g. size_t to int conversions like this suppresses) @@ -408,28 +409,21 @@ struct SingleCallback { } }; -// in the future we might want to read these from a different thread. std::shared_ptr -// seems to be better suited for this... -struct ActorLineagePropertyMap : std::enable_shared_from_this { - std::shared_ptr parent = nullptr; +struct ActorLineagePropertyMap : ReferenceCounted { }; -extern thread_local ActorLineagePropertyMap* currentLineage; - -struct ActorLineage { - std::shared_ptr properties = std::make_shared(); - ActorLineage() { - if (currentLineage) { - properties->parent = currentLineage->shared_from_this(); - } - } +struct ActorLineage : ReferenceCounted { + Reference map; + Reference parent; + ActorLineage(); }; -struct save_lineage { - ActorLineagePropertyMap* current = currentLineage; - ~save_lineage() { - currentLineage = current; - } +extern thread_local Reference currentLineage; + +struct restore_lineage { + Reference lineage; + restore_lineage() : lineage(currentLineage) {} + ~restore_lineage() { currentLineage = lineage; } }; // SAV is short for Single Assignment Variable: It can be assigned for only once! 
@@ -447,7 +441,8 @@ public: T& value() { return *(T*)&value_storage; } - SAV(int futures, int promises) : futures(futures), promises(promises), error_state(Error::fromCode(UNSET_ERROR_CODE)) { + SAV(int futures, int promises) + : futures(futures), promises(promises), error_state(Error::fromCode(UNSET_ERROR_CODE)) { Callback::prev = Callback::next = this; } ~SAV() { @@ -466,13 +461,14 @@ public: } template - void send(U && value) { + void send(U&& value) { ASSERT(canBeSet()); new (&value_storage) T(std::forward(value)); this->error_state = Error::fromCode(SET_ERROR_CODE); - save_lineage _{}; - while (Callback::next != this) + restore_lineage _; + while (Callback::next != this) { Callback::next->fire(this->value()); + } } void send(Never) { @@ -483,13 +479,15 @@ public: void sendError(Error err) { ASSERT(canBeSet() && int16_t(err.code()) > 0); this->error_state = err; - save_lineage _{}; - while (Callback::next != this) + restore_lineage _; + while (Callback::next != this) { Callback::next->error(err); + } } template void sendAndDelPromiseRef(U && value) { + restore_lineage _; ASSERT(canBeSet()); if (promises == 1 && !futures) { // No one is left to receive the value, so we can just die @@ -503,8 +501,8 @@ public: void finishSendAndDelPromiseRef() { // Call only after value_storage has already been initialized! 
+ restore_lineage _; this->error_state = Error::fromCode(SET_ERROR_CODE); - save_lineage _{}; while (Callback::next != this) Callback::next->fire(this->value()); @@ -520,6 +518,7 @@ public: } void sendErrorAndDelPromiseRef(Error err) { + restore_lineage _; ASSERT(canBeSet() && int16_t(err.code()) > 0); if (promises == 1 && !futures) { // No one is left to receive the value, so we can just die @@ -528,7 +527,6 @@ public: } this->error_state = err; - save_lineage _{}; while (Callback::next != this) Callback::next->error(err); @@ -624,6 +622,7 @@ struct NotifiedQueue : private SingleCallback, FastAllocated if (error.isValid()) return; if (SingleCallback::next != this) { + restore_lineage _; SingleCallback::next->fire(std::forward(value)); } else { @@ -635,8 +634,10 @@ struct NotifiedQueue : private SingleCallback, FastAllocated if (error.isValid()) return; this->error = err; - if (SingleCallback::next != this) + if (SingleCallback::next != this) { + restore_lineage _; SingleCallback::next->error(err); + } } void addPromiseRef() { promises++; } @@ -1016,38 +1017,67 @@ static inline void destruct(T& t) { } template -struct Actor : SAV, ActorLineage { +struct Actor : SAV { + Reference lineage = Reference{new ActorLineage() }; int8_t actor_wait_state; // -1 means actor is cancelled; 0 means actor is not waiting; 1-N mean waiting in callback group # - Actor() : SAV(1, 1), actor_wait_state(0) { /*++actorCount;*/ } + Actor() : SAV(1, 1), actor_wait_state(0) { + /*++actorCount;*/ + currentLineage = lineage; + } + + Reference setLineage() { + auto res = currentLineage; + currentLineage = lineage; + return res; + } //~Actor() { --actorCount; } }; template <> -struct Actor : ActorLineage { +struct Actor { // This specialization is for a void actor (one not returning a future, hence also uncancellable) + Reference lineage = Reference{new ActorLineage() }; int8_t actor_wait_state; // 0 means actor is not waiting; 1-N mean waiting in callback group # - Actor() : actor_wait_state(0) 
{ /*++actorCount;*/ } + Actor() : actor_wait_state(0) { + /*++actorCount;*/ + currentLineage = lineage; + } + + Reference setLineage() { + auto res = currentLineage; + currentLineage = lineage; + return res; + } //~Actor() { --actorCount; } }; template struct ActorCallback : Callback { - virtual void fire(ValueType const& value) override { static_cast(this)->a_callback_fire(this, value); } - virtual void error(Error e) override { static_cast(this)->a_callback_error(this, e); } + virtual void fire(ValueType const& value) override { + auto _ = static_cast(this)->setLineage(); + static_cast(this)->a_callback_fire(this, value); + } + virtual void error(Error e) override { + auto _ = static_cast(this)->setLineage(); + static_cast(this)->a_callback_error(this, e); + } }; template struct ActorSingleCallback : SingleCallback { virtual void fire(ValueType const& value) override { + auto _ = static_cast(this)->setLineage(); static_cast(this)->a_callback_fire(this, value); } virtual void fire(ValueType && value) override { + auto _ = static_cast(this)->setLineage(); static_cast(this)->a_callback_fire(this, std::move(value)); } virtual void error(Error e) override { + auto _ = static_cast(this)->setLineage(); static_cast(this)->a_callback_error(this, e); } }; diff --git a/flow/genericactors.actor.h b/flow/genericactors.actor.h index 3fcab1f7dd..ab9d9c07d5 100644 --- a/flow/genericactors.actor.h +++ b/flow/genericactors.actor.h @@ -1493,6 +1493,10 @@ struct YieldedFutureActor : SAV, ActorCallback setLineage() { + return currentLineage; + } + void a_callback_fire(ActorCallback*, Void) { if (int16_t(in_error_state.code()) == UNSET_ERROR_CODE) { in_error_state = Error::fromCode(SET_ERROR_CODE); From d837e923ad9f8cbf3a5bcd5668a74d4ee0222c32 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Mon, 7 Dec 2020 15:23:18 -0700 Subject: [PATCH 004/180] minor bugfix --- flow/flow.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flow/flow.cpp b/flow/flow.cpp index 
c4a6097300..ed977141bd 100644 --- a/flow/flow.cpp +++ b/flow/flow.cpp @@ -26,7 +26,7 @@ #include #include -extern thread_local Reference currentLineage; +thread_local Reference currentLineage; ActorLineage::ActorLineage() : parent(currentLineage) { } From 2c4e38329e536172d2413da61d884ef944277598 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Wed, 9 Dec 2020 10:19:32 -0700 Subject: [PATCH 005/180] fix some compiler warnings --- fdbclient/SystemData.cpp | 6 +++--- fdbserver/BackupProgress.actor.cpp | 2 +- fdbserver/BackupWorker.actor.cpp | 6 +++--- fdbserver/CommitProxyServer.actor.cpp | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/fdbclient/SystemData.cpp b/fdbclient/SystemData.cpp index b402ad99a7..16733b1ad6 100644 --- a/fdbclient/SystemData.cpp +++ b/fdbclient/SystemData.cpp @@ -57,7 +57,7 @@ const Value keyServersValue( Standalone result, const std::vecto std::vector destTag; bool foundOldLocality = false; - for (const KeyValueRef kv : result) { + for (const KeyValueRef& kv : result) { UID uid = decodeServerTagKey(kv.key); if (std::find(src.begin(), src.end(), uid) != src.end()) { srcTag.push_back( decodeServerTagValue(kv.value) ); @@ -109,7 +109,7 @@ void decodeKeyServersValue( Standalone result, const ValueRef& v src.clear(); dest.clear(); - for (const KeyValueRef kv : result) { + for (const KeyValueRef& kv : result) { Tag tag = decodeServerTagValue(kv.value); if (std::find(srcTag.begin(), srcTag.end(), tag) != srcTag.end()) { src.push_back( decodeServerTagKey(kv.key) ); @@ -122,7 +122,7 @@ void decodeKeyServersValue( Standalone result, const ValueRef& v std::sort(dest.begin(), dest.end()); if(missingIsError && (src.size() != srcTag.size() || dest.size() != destTag.size())) { TraceEvent(SevError, "AttemptedToDecodeMissingTag"); - for (const KeyValueRef kv : result) { + for (const KeyValueRef& kv : result) { Tag tag = decodeServerTagValue(kv.value); UID serverID = decodeServerTagKey(kv.key); TraceEvent("TagUIDMap").detail("Tag", 
tag.toString()).detail("UID", serverID.toString()); diff --git a/fdbserver/BackupProgress.actor.cpp b/fdbserver/BackupProgress.actor.cpp index 3f1d564c16..f496ec0558 100644 --- a/fdbserver/BackupProgress.actor.cpp +++ b/fdbserver/BackupProgress.actor.cpp @@ -121,7 +121,7 @@ std::map, std::map> BackupProgr } } - for (const Tag tag : tags) { // tags without progress data + for (const Tag& tag : tags) { // tags without progress data tagVersions.insert({ tag, adjustedBeginVersion }); TraceEvent("BackupVersionRange", dbgid) .detail("OldEpoch", epoch) diff --git a/fdbserver/BackupWorker.actor.cpp b/fdbserver/BackupWorker.actor.cpp index 3cea9f6611..b5f78593e2 100644 --- a/fdbserver/BackupWorker.actor.cpp +++ b/fdbserver/BackupWorker.actor.cpp @@ -508,7 +508,7 @@ ACTOR Future setBackupKeys(BackupData* self, std::map savedL state std::vector>> prevVersions; state std::vector versionConfigs; state std::vector>> allWorkersReady; - for (const auto [uid, version] : savedLogVersions) { + for (const auto& [uid, version] : savedLogVersions) { versionConfigs.emplace_back(uid); prevVersions.push_back(versionConfigs.back().latestBackupWorkerSavedVersion().get(tr)); allWorkersReady.push_back(versionConfigs.back().allWorkerStarted().get(tr)); @@ -573,7 +573,7 @@ ACTOR Future monitorBackupProgress(BackupData* self) { if (self->recruitedEpoch == self->oldestBackupEpoch) { // update update progress so far if previous epochs are done Version v = std::numeric_limits::max(); - for (const auto [tag, version] : tagVersions) { + for (const auto& [tag, version] : tagVersions) { v = std::min(v, version); } savedLogVersions.emplace(uid, v); @@ -783,7 +783,7 @@ ACTOR Future saveMutationsToFile(BackupData* self, Version popVersion, int .detail("TagId", self->tag.id) .detail("File", file->getFileName()); } - for (const UID uid : activeUids) { + for (const UID& uid : activeUids) { self->backups[uid].lastSavedVersion = popVersion + 1; } diff --git a/fdbserver/CommitProxyServer.actor.cpp 
b/fdbserver/CommitProxyServer.actor.cpp index eac0f0d4c2..96ae4c000c 100644 --- a/fdbserver/CommitProxyServer.actor.cpp +++ b/fdbserver/CommitProxyServer.actor.cpp @@ -1778,7 +1778,7 @@ ACTOR Future commitProxyServerCore(CommitProxyInterface proxy, MasterInter state KeyRange txnKeys = allKeys; Standalone UIDtoTagMap = commitData.txnStateStore->readRange( serverTagKeys ).get(); state std::map tag_uid; - for (const KeyValueRef kv : UIDtoTagMap) { + for (const KeyValueRef& kv : UIDtoTagMap) { tag_uid[decodeServerTagValue(kv.value)] = decodeServerTagKey(kv.key); } loop { From 0d324cee80b306797e6f92392414b786ad5ce914 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Wed, 9 Dec 2020 10:19:59 -0700 Subject: [PATCH 006/180] Annotation framework and role lineage --- fdbrpc/CMakeLists.txt | 2 + fdbrpc/Locality.h | 1 + fdbrpc/RoleLineage.cpp | 23 ++++++++++ fdbrpc/RoleLineage.h | 31 +++++++++++++ fdbserver/worker.actor.cpp | 3 ++ flow/flow.cpp | 6 +++ flow/flow.h | 90 ++++++++++++++++++++++++++++++++------ 7 files changed, 142 insertions(+), 14 deletions(-) create mode 100644 fdbrpc/RoleLineage.cpp create mode 100644 fdbrpc/RoleLineage.h diff --git a/fdbrpc/CMakeLists.txt b/fdbrpc/CMakeLists.txt index b4fb20098d..41229dce47 100644 --- a/fdbrpc/CMakeLists.txt +++ b/fdbrpc/CMakeLists.txt @@ -22,6 +22,8 @@ set(FDBRPC_SRCS ReplicationPolicy.cpp ReplicationTypes.cpp ReplicationUtils.cpp + RoleLineage.h + RoleLineage.cpp Stats.actor.cpp Stats.h sim2.actor.cpp diff --git a/fdbrpc/Locality.h b/fdbrpc/Locality.h index 11c209071a..2129b7a3b7 100644 --- a/fdbrpc/Locality.h +++ b/fdbrpc/Locality.h @@ -63,6 +63,7 @@ struct ProcessClass { Ratekeeper, StorageCache, Backup, + Worker, // used for actor lineage tracking NoRole }; enum ClassSource { CommandLineSource, AutoSource, DBSource, InvalidSource = -1 }; diff --git a/fdbrpc/RoleLineage.cpp b/fdbrpc/RoleLineage.cpp new file mode 100644 index 0000000000..89a64bbe40 --- /dev/null +++ b/fdbrpc/RoleLineage.cpp @@ -0,0 +1,23 @@ +/* + * 
RoleLineage.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2020 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "fdbrpc/RoleLineage.h" + +StringRef RoleLineage::name = "RoleLineage"_sr; diff --git a/fdbrpc/RoleLineage.h b/fdbrpc/RoleLineage.h new file mode 100644 index 0000000000..30a2ea2650 --- /dev/null +++ b/fdbrpc/RoleLineage.h @@ -0,0 +1,31 @@ +/* + * RoleLineage.h + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2020 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once +#include "fdbrpc/Locality.h" + +struct RoleLineage : LineageProperties { + static StringRef name; + ProcessClass::ClusterRole role = ProcessClass::NoRole; + + bool isSet(ProcessClass::ClusterRole RoleLineage::*member) { + return this->*member != ProcessClass::NoRole; + } +}; diff --git a/fdbserver/worker.actor.cpp b/fdbserver/worker.actor.cpp index ca34f903a2..98363ea247 100644 --- a/fdbserver/worker.actor.cpp +++ b/fdbserver/worker.actor.cpp @@ -22,6 +22,7 @@ #include #include "fdbrpc/Locality.h" +#include "fdbrpc/RoleLineage.h" #include "fdbclient/StorageServerInterface.h" #include "fdbserver/Knobs.h" #include "flow/ActorCollection.h" @@ -46,6 +47,7 @@ #include "flow/Profiler.h" #include "flow/ThreadHelper.actor.h" #include "flow/Trace.h" +#include "flow/flow.h" #ifdef __linux__ #include @@ -1810,6 +1812,7 @@ ACTOR Future fdbd( { state vector> actors; state Promise recoveredDiskFiles; + currentLineage->modify(&RoleLineage::role) = ProcessClass::Worker; try { ServerCoordinators coordinators( connFile ); diff --git a/flow/flow.cpp b/flow/flow.cpp index ed977141bd..5b354fe054 100644 --- a/flow/flow.cpp +++ b/flow/flow.cpp @@ -31,6 +31,12 @@ thread_local Reference currentLineage; ActorLineage::ActorLineage() : parent(currentLineage) { } +ActorLineage::~ActorLineage() { + for (auto ptr : properties) { + delete ptr.second; + } +} + #if (defined(__linux__) || defined(__FreeBSD__)) && defined(__AVX__) && !defined(MEMORY_SANITIZER) // For benchmarking; need a version of rte_memcpy that doesn't live in the same compilation unit as the test. 
void * rte_memcpy_noinline(void *__restrict __dest, const void *__restrict __src, size_t __n) { diff --git a/flow/flow.h b/flow/flow.h index a0c9793a7a..0ffc895a86 100644 --- a/flow/flow.h +++ b/flow/flow.h @@ -20,6 +20,7 @@ #ifndef FLOW_FLOW_H #define FLOW_FLOW_H +#include "flow/Arena.h" #include "flow/FastRef.h" #pragma once @@ -29,6 +30,7 @@ #include #include +#include #include #include #include @@ -409,21 +411,88 @@ struct SingleCallback { } }; -struct ActorLineagePropertyMap : ReferenceCounted { +struct LineagePropertiesBase { +}; + +// helper class to make implementation of LineageProperties easier +template +struct LineageProperties : LineagePropertiesBase { + // Contract: + // + // StringRef name = "SomeUniqueName"_str; + + + // this has to be implemented by subclasses + // but can't be made virtual. + // A user should implement this for any type + // within the properies class. + template + bool isSet(Value Derived::*member) { + return true; + } }; struct ActorLineage : ReferenceCounted { - Reference map; +private: + std::unordered_map properties; Reference parent; +public: ActorLineage(); + ~ActorLineage(); + bool isRoot() const { + return parent.getPtr() == nullptr; + } + void makeRoot() { + parent.clear(); + } + template + V& modify(V T::*member) { + auto& res = properties[T::name]; + if (!res) { + res = new T{}; + } + T* map = static_cast(res); + return map->*member; + } + template + std::optional get(V T::*member) const { + auto current = this; + while (current != nullptr) { + auto iter = current->properties.find(T::name); + if (iter != current->properties.end()) { + T const& map = static_cast(*iter->second); + if (map.isSet(member)) { + return map.*member; + } + } + current = current->parent.getPtr(); + } + return std::optional{}; + } + template + std::stack stack(V T::*member) const { + auto current = this; + std::stack res; + while (current != nullptr) { + auto iter = current->properties.find(T::name); + if (iter != current->properties.end()) { + T 
const& map = static_cast(*iter->second); + if (map.isSet(member)) { + res.push(map.*member); + } + } + current = current->parent.getPtr(); + } + return res; + } }; extern thread_local Reference currentLineage; struct restore_lineage { - Reference lineage; - restore_lineage() : lineage(currentLineage) {} - ~restore_lineage() { currentLineage = lineage; } + Reference prev; + restore_lineage() : prev(currentLineage) {} + ~restore_lineage() { currentLineage = prev; } }; // SAV is short for Single Assignment Variable: It can be assigned for only once! @@ -465,7 +534,6 @@ public: ASSERT(canBeSet()); new (&value_storage) T(std::forward(value)); this->error_state = Error::fromCode(SET_ERROR_CODE); - restore_lineage _; while (Callback::next != this) { Callback::next->fire(this->value()); } @@ -479,7 +547,6 @@ public: void sendError(Error err) { ASSERT(canBeSet() && int16_t(err.code()) > 0); this->error_state = err; - restore_lineage _; while (Callback::next != this) { Callback::next->error(err); } @@ -487,7 +554,6 @@ public: template void sendAndDelPromiseRef(U && value) { - restore_lineage _; ASSERT(canBeSet()); if (promises == 1 && !futures) { // No one is left to receive the value, so we can just die @@ -501,7 +567,6 @@ public: void finishSendAndDelPromiseRef() { // Call only after value_storage has already been initialized! 
- restore_lineage _; this->error_state = Error::fromCode(SET_ERROR_CODE); while (Callback::next != this) Callback::next->fire(this->value()); @@ -518,7 +583,6 @@ public: } void sendErrorAndDelPromiseRef(Error err) { - restore_lineage _; ASSERT(canBeSet() && int16_t(err.code()) > 0); if (promises == 1 && !futures) { // No one is left to receive the value, so we can just die @@ -622,7 +686,6 @@ struct NotifiedQueue : private SingleCallback, FastAllocated if (error.isValid()) return; if (SingleCallback::next != this) { - restore_lineage _; SingleCallback::next->fire(std::forward(value)); } else { @@ -635,7 +698,6 @@ struct NotifiedQueue : private SingleCallback, FastAllocated this->error = err; if (SingleCallback::next != this) { - restore_lineage _; SingleCallback::next->error(err); } } @@ -1025,13 +1087,13 @@ struct Actor : SAV { /*++actorCount;*/ currentLineage = lineage; } + //~Actor() { --actorCount; } Reference setLineage() { auto res = currentLineage; currentLineage = lineage; return res; } - //~Actor() { --actorCount; } }; template <> @@ -1045,13 +1107,13 @@ struct Actor { /*++actorCount;*/ currentLineage = lineage; } + //~Actor() { --actorCount; } Reference setLineage() { auto res = currentLineage; currentLineage = lineage; return res; } - //~Actor() { --actorCount; } }; template From 945d0246cddc0dcfff982f22af54c43617bc79a8 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Wed, 9 Dec 2020 13:28:15 -0700 Subject: [PATCH 007/180] add actor stacktrace feature --- flow/actorcompiler/ActorCompiler.cs | 3 ++- flow/flow.cpp | 6 ++++++ flow/flow.h | 12 ++++++++++++ 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/flow/actorcompiler/ActorCompiler.cs b/flow/actorcompiler/ActorCompiler.cs index dc9de91868..28771f4503 100644 --- a/flow/actorcompiler/ActorCompiler.cs +++ b/flow/actorcompiler/ActorCompiler.cs @@ -452,7 +452,7 @@ namespace actorcompiler fullClassName, string.Join(", ", actor.parameters.Select(p => p.name).ToArray())); - 
writer.WriteLine("restore_lineage _;"); + writer.WriteLine("\trestore_lineage _;"); if (actor.returnType != null) writer.WriteLine("\treturn Future<{1}>({0});", newActor, actor.returnType); else @@ -1287,6 +1287,7 @@ namespace actorcompiler constructor.WriteLine("{"); constructor.Indent(+1); ProbeEnter(constructor, actor.name); + constructor.WriteLine("currentLineage->modify(&StackLineage::actorName) = LiteralStringRef(\"{0}\");", actor.name); constructor.WriteLine("this->{0};", body.call()); ProbeExit(constructor, actor.name); WriteFunction(writer, constructor, constructor.BodyText); diff --git a/flow/flow.cpp b/flow/flow.cpp index 5b354fe054..2e47847fcd 100644 --- a/flow/flow.cpp +++ b/flow/flow.cpp @@ -37,6 +37,12 @@ ActorLineage::~ActorLineage() { } } +StringRef StackLineage::name = "StackLineage"_sr; + +std::stack getActorStackTrace() { + return currentLineage->stack(&StackLineage::actorName); +} + #if (defined(__linux__) || defined(__FreeBSD__)) && defined(__AVX__) && !defined(MEMORY_SANITIZER) // For benchmarking; need a version of rte_memcpy that doesn't live in the same compilation unit as the test. void * rte_memcpy_noinline(void *__restrict __dest, const void *__restrict __src, size_t __n) { diff --git a/flow/flow.h b/flow/flow.h index 0ffc895a86..518dbd036c 100644 --- a/flow/flow.h +++ b/flow/flow.h @@ -495,6 +495,18 @@ struct restore_lineage { ~restore_lineage() { currentLineage = prev; } }; +struct StackLineage : LineageProperties { + static StringRef name; + StringRef actorName; + + template + bool isSet(Value StackLineage::*member) { + return true; + } +}; + +extern std::stack getActorStackTrace(); + // SAV is short for Single Assignment Variable: It can be assigned for only once! 
template struct SAV : private Callback, FastAllocated> { From f8e1df6c4f8c5a687afffe2b9a28aa13e32ae9d5 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Thu, 10 Dec 2020 10:42:04 -0700 Subject: [PATCH 008/180] Support for actor stack traces --- fdbrpc/RoleLineage.h | 2 +- fdbserver/CMakeLists.txt | 1 + fdbserver/SigStack.cpp | 23 +++++++++++++++++++++++ fdbserver/worker.actor.cpp | 3 +++ flow/flow.h | 7 +------ tests/TestRunner/local_cluster.py | 2 +- 6 files changed, 30 insertions(+), 8 deletions(-) create mode 100644 fdbserver/SigStack.cpp diff --git a/fdbrpc/RoleLineage.h b/fdbrpc/RoleLineage.h index 30a2ea2650..8e9d3f4e9e 100644 --- a/fdbrpc/RoleLineage.h +++ b/fdbrpc/RoleLineage.h @@ -25,7 +25,7 @@ struct RoleLineage : LineageProperties { static StringRef name; ProcessClass::ClusterRole role = ProcessClass::NoRole; - bool isSet(ProcessClass::ClusterRole RoleLineage::*member) { + bool isSet(ProcessClass::ClusterRole RoleLineage::*member) const { return this->*member != ProcessClass::NoRole; } }; diff --git a/fdbserver/CMakeLists.txt b/fdbserver/CMakeLists.txt index bf266069cb..f52e5b8279 100644 --- a/fdbserver/CMakeLists.txt +++ b/fdbserver/CMakeLists.txt @@ -88,6 +88,7 @@ set(FDBSERVER_SRCS ResolverInterface.h ServerDBInfo.actor.h ServerDBInfo.h + SigStack.cpp SimulatedCluster.actor.cpp SimulatedCluster.h SkipList.cpp diff --git a/fdbserver/SigStack.cpp b/fdbserver/SigStack.cpp new file mode 100644 index 0000000000..efec5aff7d --- /dev/null +++ b/fdbserver/SigStack.cpp @@ -0,0 +1,23 @@ +#include "flow/flow.h" +#include +#include +#include + +// This is not yet correct, as this is not async safe +// However, this should be good enough for an initial +// proof of concept. 
+extern "C" void stackSignalHandler(int sig) { + auto stack = getActorStackTrace(); + int i = 0; + while (!stack.empty()) { + auto s = stack.top(); + stack.pop(); + std::string_view n(reinterpret_cast(s.begin()), s.size()); + std::cout << i << ": " << n << std::endl; + ++i; + } +} + +void setupStackSignal() { + std::signal(SIGUSR1, &stackSignalHandler); +} diff --git a/fdbserver/worker.actor.cpp b/fdbserver/worker.actor.cpp index 98363ea247..5d371c0c80 100644 --- a/fdbserver/worker.actor.cpp +++ b/fdbserver/worker.actor.cpp @@ -1798,6 +1798,8 @@ ACTOR Future monitorLeaderRemotelyWithDelayedCandidacy( Reference fdbd( Reference connFile, LocalityData localities, @@ -1812,6 +1814,7 @@ ACTOR Future fdbd( { state vector> actors; state Promise recoveredDiskFiles; + setupStackSignal(); currentLineage->modify(&RoleLineage::role) = ProcessClass::Worker; try { diff --git a/flow/flow.h b/flow/flow.h index 518dbd036c..b1e4c1e1fb 100644 --- a/flow/flow.h +++ b/flow/flow.h @@ -427,7 +427,7 @@ struct LineageProperties : LineagePropertiesBase { // A user should implement this for any type // within the properies class. 
template - bool isSet(Value Derived::*member) { + bool isSet(Value Derived::*member) const { return true; } }; @@ -498,11 +498,6 @@ struct restore_lineage { struct StackLineage : LineageProperties { static StringRef name; StringRef actorName; - - template - bool isSet(Value StackLineage::*member) { - return true; - } }; extern std::stack getActorStackTrace(); diff --git a/tests/TestRunner/local_cluster.py b/tests/TestRunner/local_cluster.py index 68318d51dd..85f2094774 100644 --- a/tests/TestRunner/local_cluster.py +++ b/tests/TestRunner/local_cluster.py @@ -38,7 +38,7 @@ cluster_file = {etcdir}/fdb.cluster command = {fdbserver_bin} public_address = auto:$ID listen_address = public -datadir = {datadir} +datadir = {datadir}/$ID logdir = {logdir} # logsize = 10MiB # maxlogssize = 100MiB From fb64902d5c5b6e88501ebe906d4d939f61257b9b Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Tue, 19 Jan 2021 16:04:09 -0700 Subject: [PATCH 009/180] Assign roles --- fdbrpc/CMakeLists.txt | 2 -- fdbserver/CMakeLists.txt | 2 ++ .../RoleLineage.actor.cpp | 2 +- .../RoleLineage.actor.h | 21 ++++++++++++++- fdbserver/worker.actor.cpp | 26 ++++++++++++++++++- flow/flow.cpp | 5 ++-- flow/flow.h | 16 ++++++++++++ 7 files changed, 67 insertions(+), 7 deletions(-) rename fdbrpc/RoleLineage.cpp => fdbserver/RoleLineage.actor.cpp (95%) rename fdbrpc/RoleLineage.h => fdbserver/RoleLineage.actor.h (59%) diff --git a/fdbrpc/CMakeLists.txt b/fdbrpc/CMakeLists.txt index 7a9ce26a10..af84676be7 100644 --- a/fdbrpc/CMakeLists.txt +++ b/fdbrpc/CMakeLists.txt @@ -22,8 +22,6 @@ set(FDBRPC_SRCS ReplicationPolicy.cpp ReplicationTypes.cpp ReplicationUtils.cpp - RoleLineage.h - RoleLineage.cpp Stats.actor.cpp Stats.h sim2.actor.cpp diff --git a/fdbserver/CMakeLists.txt b/fdbserver/CMakeLists.txt index afc45b2cc4..9e406a0d26 100644 --- a/fdbserver/CMakeLists.txt +++ b/fdbserver/CMakeLists.txt @@ -86,6 +86,8 @@ set(FDBSERVER_SRCS RestoreWorker.actor.cpp Resolver.actor.cpp ResolverInterface.h + 
RoleLineage.actor.h + RoleLineage.actor.cpp ServerDBInfo.actor.h ServerDBInfo.h SigStack.cpp diff --git a/fdbrpc/RoleLineage.cpp b/fdbserver/RoleLineage.actor.cpp similarity index 95% rename from fdbrpc/RoleLineage.cpp rename to fdbserver/RoleLineage.actor.cpp index 89a64bbe40..6d1b49527a 100644 --- a/fdbrpc/RoleLineage.cpp +++ b/fdbserver/RoleLineage.actor.cpp @@ -18,6 +18,6 @@ * limitations under the License. */ -#include "fdbrpc/RoleLineage.h" +#include "fdbserver/RoleLineage.actor.h" StringRef RoleLineage::name = "RoleLineage"_sr; diff --git a/fdbrpc/RoleLineage.h b/fdbserver/RoleLineage.actor.h similarity index 59% rename from fdbrpc/RoleLineage.h rename to fdbserver/RoleLineage.actor.h index 8e9d3f4e9e..d35c749771 100644 --- a/fdbrpc/RoleLineage.h +++ b/fdbserver/RoleLineage.actor.h @@ -1,5 +1,5 @@ /* - * RoleLineage.h + * RoleLineage.actor.h * * This source file is part of the FoundationDB open source project * @@ -19,7 +19,15 @@ */ #pragma once +#include "flow/flow.h" +#if defined(NO_INTELLISENSE) && !defined(FDBSERVER_ROLE_LINEAGE_ACTOR_G_H) +# define FDBSERVER_ROLE_LINEAGE_ACTOR_G_H +# include "fdbserver/RoleLineage.actor.g.h" +#elif !defined(FDBSERVER_ROLE_LINEAGE_ACTOR_H) +# define FDBSERVER_ROLE_LINEAGE_ACTOR_H + #include "fdbrpc/Locality.h" +#include "flow/actorcompiler.h" // This must be the last include struct RoleLineage : LineageProperties { static StringRef name; @@ -29,3 +37,14 @@ struct RoleLineage : LineageProperties { return this->*member != ProcessClass::NoRole; } }; + +// creates a new root and sets the role lineage +ACTOR template +Future()())> runInRole(Fun fun, ProcessClass::ClusterRole role) { + currentLineage->makeRoot(); + currentLineage->modify(&RoleLineage::role) = role; + decltype(std::declval()()) res = wait(fun()); + return res; +} + +#endif diff --git a/fdbserver/worker.actor.cpp b/fdbserver/worker.actor.cpp index 36f5c14860..19aea8622c 100644 --- a/fdbserver/worker.actor.cpp +++ b/fdbserver/worker.actor.cpp @@ -22,7 +22,6 @@ 
#include #include "fdbrpc/Locality.h" -#include "fdbrpc/RoleLineage.h" #include "fdbclient/StorageServerInterface.h" #include "fdbserver/Knobs.h" #include "flow/ActorCollection.h" @@ -33,6 +32,7 @@ #include "fdbclient/NativeAPI.actor.h" #include "fdbserver/MetricLogger.h" #include "fdbserver/BackupInterface.h" +#include "fdbserver/RoleLineage.actor.h" #include "fdbserver/WorkerInterface.actor.h" #include "fdbserver/IKeyValueStore.h" #include "fdbserver/WaitFailure.h" @@ -1024,6 +1024,8 @@ ACTOR Future workerServer( DiskStore s = stores[f]; // FIXME: Error handling if( s.storedComponent == DiskStore::Storage ) { + LocalLineage _; + currentLineage->modify(&RoleLineage::role) = ProcessClass::ClusterRole::Storage; IKeyValueStore* kv = openKVStore(s.storeType, s.filename, s.storeID, memoryLimit, false, validateDataFiles); Future kvClosed = kv->onClosed(); filesClosed.add( kvClosed ); @@ -1058,6 +1060,8 @@ ACTOR Future workerServer( f = storageServerRollbackRebooter( f, s.storeType, s.filename, recruited.id(), recruited.locality, dbInfo, folder, &filesClosed, memoryLimit, kv); errorForwarders.add( forwardError( errors, Role::STORAGE_SERVER, recruited.id(), f ) ); } else if( s.storedComponent == DiskStore::TLogData ) { + LocalLineage _; + currentLineage->modify(&RoleLineage::role) = ProcessClass::ClusterRole::TLog; std::string logQueueBasename; const std::string filename = basename(s.filename); if (StringRef(filename).startsWith(fileLogDataPrefix)) { @@ -1218,6 +1222,8 @@ ACTOR Future workerServer( } } when( RecruitMasterRequest req = waitNext(interf.master.getFuture()) ) { + LocalLineage _; + currentLineage->modify(&RoleLineage::role) = ProcessClass::ClusterRole::Master; MasterInterface recruited; recruited.locality = locality; recruited.initEndpoints(); @@ -1238,6 +1244,8 @@ ACTOR Future workerServer( req.reply.send(recruited); } when ( InitializeDataDistributorRequest req = waitNext(interf.dataDistributor.getFuture()) ) { + LocalLineage _; + 
currentLineage->modify(&RoleLineage::role) = ProcessClass::ClusterRole::DataDistributor; DataDistributorInterface recruited(locality); recruited.initEndpoints(); @@ -1256,6 +1264,8 @@ ACTOR Future workerServer( req.reply.send(recruited); } when ( InitializeRatekeeperRequest req = waitNext(interf.ratekeeper.getFuture()) ) { + LocalLineage _; + currentLineage->modify(&RoleLineage::role) = ProcessClass::ClusterRole::Ratekeeper; RatekeeperInterface recruited(locality, req.reqId); recruited.initEndpoints(); @@ -1280,6 +1290,8 @@ ACTOR Future workerServer( } when (InitializeBackupRequest req = waitNext(interf.backup.getFuture())) { if (!backupWorkerCache.exists(req.reqId)) { + LocalLineage _; + currentLineage->modify(&RoleLineage::role) = ProcessClass::ClusterRole::Backup; BackupInterface recruited(locality); recruited.initEndpoints(); @@ -1309,6 +1321,8 @@ ACTOR Future workerServer( .detail("MinRecruitable", TLogVersion::MIN_RECRUITABLE); req.reply.sendError(internal_error()); } + LocalLineage _; + currentLineage->modify(&RoleLineage::role) = ProcessClass::ClusterRole::TLog; TLogOptions tLogOptions(req.logVersion, req.spillType); TLogFn tLogFn = tLogFnForOptions(tLogOptions); auto& logData = sharedLogs[SharedLogsKey(tLogOptions, req.storeType)]; @@ -1341,6 +1355,8 @@ ACTOR Future workerServer( } when( InitializeStorageRequest req = waitNext(interf.storage.getFuture()) ) { if( !storageCache.exists( req.reqId ) ) { + LocalLineage _; + currentLineage->modify(&RoleLineage::role) = ProcessClass::ClusterRole::Storage; StorageServerInterface recruited(req.interfaceId); recruited.locality = locality; recruited.initEndpoints(); @@ -1379,6 +1395,8 @@ ACTOR Future workerServer( forwardPromise( req.reply, storageCache.get( req.reqId ) ); } when(InitializeCommitProxyRequest req = waitNext(interf.commitProxy.getFuture())) { + LocalLineage _; + currentLineage->modify(&RoleLineage::role) = ProcessClass::ClusterRole::CommitProxy; CommitProxyInterface recruited; recruited.processId = 
locality.processId(); recruited.provisional = false; @@ -1402,6 +1420,8 @@ ACTOR Future workerServer( req.reply.send(recruited); } when( InitializeGrvProxyRequest req = waitNext(interf.grvProxy.getFuture()) ) { + LocalLineage _; + currentLineage->modify(&RoleLineage::role) = ProcessClass::ClusterRole::GrvProxy; GrvProxyInterface recruited; recruited.processId = locality.processId(); recruited.provisional = false; @@ -1421,6 +1441,8 @@ ACTOR Future workerServer( req.reply.send(recruited); } when( InitializeResolverRequest req = waitNext(interf.resolver.getFuture()) ) { + LocalLineage _; + currentLineage->modify(&RoleLineage::role) = ProcessClass::ClusterRole::Resolver; ResolverInterface recruited; recruited.locality = locality; recruited.initEndpoints(); @@ -1438,6 +1460,8 @@ ACTOR Future workerServer( req.reply.send(recruited); } when( InitializeLogRouterRequest req = waitNext(interf.logRouter.getFuture()) ) { + LocalLineage _; + currentLineage->modify(&RoleLineage::role) = ProcessClass::ClusterRole::LogRouter; TLogInterface recruited(locality); recruited.initEndpoints(); diff --git a/flow/flow.cpp b/flow/flow.cpp index 2e47847fcd..c90bbbe9ae 100644 --- a/flow/flow.cpp +++ b/flow/flow.cpp @@ -28,8 +28,9 @@ thread_local Reference currentLineage; -ActorLineage::ActorLineage() : parent(currentLineage) { -} +LineagePropertiesBase::~LineagePropertiesBase() {} + +ActorLineage::ActorLineage() : parent(currentLineage) {} ActorLineage::~ActorLineage() { for (auto ptr : properties) { diff --git a/flow/flow.h b/flow/flow.h index e043ab49d4..9b3ba698b6 100644 --- a/flow/flow.h +++ b/flow/flow.h @@ -412,6 +412,7 @@ struct SingleCallback { }; struct LineagePropertiesBase { + virtual ~LineagePropertiesBase(); }; // helper class to make implementation of LineageProperties easier @@ -433,6 +434,7 @@ struct LineageProperties : LineagePropertiesBase { }; struct ActorLineage : ReferenceCounted { + friend class LocalLineage; private: std::unordered_map properties; Reference parent; @@ 
-489,6 +491,20 @@ public: extern thread_local Reference currentLineage; +// This class can be used in order to modify all lineage properties +// of actors created within a (non-actor) scope +struct LocalLineage { + Reference lineage = Reference{new ActorLineage() }; + Reference oldLineage; + LocalLineage() { + oldLineage = currentLineage; + currentLineage = lineage; + } + ~LocalLineage() { + currentLineage = oldLineage; + } +}; + struct restore_lineage { Reference prev; restore_lineage() : prev(currentLineage) {} From f40d8c2f490a08351ce3d7e91bfd6752e268548a Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Tue, 19 Jan 2021 16:04:21 -0700 Subject: [PATCH 010/180] make profiler signal handler reentrant safe --- flow/Profiler.actor.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/flow/Profiler.actor.cpp b/flow/Profiler.actor.cpp index ece9bcfafd..33d1542db7 100644 --- a/flow/Profiler.actor.cpp +++ b/flow/Profiler.actor.cpp @@ -148,6 +148,8 @@ struct Profiler { } void signal_handler() { // async signal safe! + static std::atomic inSigHandler = false; + if (!inSigHandler.exchange(true)) { return; } if(profilingEnabled) { double t = timer(); output_buffer->push(*(void**)&t); @@ -156,6 +158,7 @@ struct Profiler { output_buffer->push(addresses[i]); output_buffer->push((void*)-1LL); } + inSigHandler.store(false); } static void signal_handler_for_closure(int, siginfo_t* si, void*, void* self) { // async signal safe! From c3efbe3040770dae65319446b9b3877f29b0ee44 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Tue, 19 Jan 2021 16:52:30 -0700 Subject: [PATCH 011/180] fixed minor bug --- flow/Profiler.actor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flow/Profiler.actor.cpp b/flow/Profiler.actor.cpp index 33d1542db7..d691f46205 100644 --- a/flow/Profiler.actor.cpp +++ b/flow/Profiler.actor.cpp @@ -149,7 +149,7 @@ struct Profiler { void signal_handler() { // async signal safe! 
static std::atomic inSigHandler = false; - if (!inSigHandler.exchange(true)) { return; } + if (inSigHandler.exchange(true)) { return; } if(profilingEnabled) { double t = timer(); output_buffer->push(*(void**)&t); From a8c7a798f2483c22ffd6c8dacbb0946c81237c12 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Wed, 17 Mar 2021 15:34:20 -0600 Subject: [PATCH 012/180] First prototype of actorlineageset --- flow/ActorLineageSet.cpp | 118 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 118 insertions(+) create mode 100644 flow/ActorLineageSet.cpp diff --git a/flow/ActorLineageSet.cpp b/flow/ActorLineageSet.cpp new file mode 100644 index 0000000000..9fb93e9df7 --- /dev/null +++ b/flow/ActorLineageSet.cpp @@ -0,0 +1,118 @@ +/* + * ActorLineageSet.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flow/flow.h" +#include + +class ActorLineageSet { +public: + // The type we use for lookup into the set. 
Gets assigned during insert + using Index = unsigned; + // For now we use a fixed size capacity + constexpr static Index CAPACITY = 1024; + constexpr static Index npos = std::numeric_limits::max(); + + explicit ActorLineageSet(); + ActorLineageSet(const ActorLineageSet&) = delete; + ActorLineageSet& operator=(const ActorLineageSet&) = delete; + + // Returns the number of elements at the time of calling. Keep in mind that this is a lockfree data structure, so + // the actual size might change anytime after or even during the call. This function only guarantees that the size + // was whatever the method returns at one point between the start and the end of the function call. The safest way + // to handle this is by assuming that this returns an estimate. + unsigned size(); + + Index insert(const Reference& lineage); + void erase(Index idx); + std::vector> copy(); + +private: + static constexpr uintptr_t FREE = 0b1; + static constexpr uintptr_t LOCK = 0b10; + std::atomic _size = 0; + std::vector> _set; + boost::lockfree::queue, boost::lockfree::capacity> freeQueue; + boost::lockfree::queue, boost::lockfree::capacity> + freeList; +}; + +ActorLineageSet::ActorLineageSet() { + // insert the free indexes in reverse order + for (unsigned i = CAPACITY; i > 0; --i) { + freeQueue.push(i - 1); + _set[i] = uintptr_t(1); + } +} + +std::vector> ActorLineageSet::copy() { + std::vector> result; + for (int i = 0; i < CAPACITY; ++i) { + auto ptr = _set[i].load(); + if ((ptr & FREE) != 0) { + ASSERT((ptr & LOCK) == 0); + if (_set[i].compare_exchange_strong(ptr, ptr | LOCK)) { + ActorLineage* entry = reinterpret_cast(ptr); + ptr |= LOCK; + entry->addref(); + // we try to unlock now. If this element was removed while we incremented the refcount, the element will + // end up in the freeList, so we will decrement later. 
+ _set[i].compare_exchange_strong(ptr, ptr ^ LOCK); + result.emplace_back(entry); + } + } + } + // after we're done we need to clean up all objects that contented on a lock. This won't be perfect (as some thread + // might not yet added the object to the free list), but whatever we don't get now we'll clean up in the next + // iteration + ActorLineage* toClean; + while (freeList.pop(toClean)) { + toClean->delref(); + } + return result; +} + +ActorLineageSet::Index ActorLineageSet::insert(const Reference& lineage) { + Index res; + if (!freeQueue.pop(res)) { + TraceEvent(SevWarnAlways, "NoCapacityInActorLineageSet"); + return npos; + } + ASSERT(_set[res].load() & FREE); + auto ptr = reinterpret_cast(lineage.getPtr()); + lineage->addref(); + _set[res].store(ptr); + return res; +} + +void ActorLineageSet::erase(Index idx) { + while (true) { + auto ptr = _set[idx].load(); + if (ptr & LOCK) { + _set[idx].store(FREE); + freeList.push(reinterpret_cast(ptr ^ LOCK)); + return; + } else { + if (_set[idx].compare_exchange_strong(ptr, FREE)) { + reinterpret_cast(ptr)->delref(); + return; + } + } + } +} \ No newline at end of file From 9812a49058adf16c2cdd1445f876f372be074109 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Wed, 17 Mar 2021 15:40:19 -0600 Subject: [PATCH 013/180] use consume_all to clean up after copy --- flow/ActorLineageSet.cpp | 5 +---- flow/CMakeLists.txt | 1 + 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/flow/ActorLineageSet.cpp b/flow/ActorLineageSet.cpp index 9fb93e9df7..0957339501 100644 --- a/flow/ActorLineageSet.cpp +++ b/flow/ActorLineageSet.cpp @@ -81,10 +81,7 @@ std::vector> ActorLineageSet::copy() { // after we're done we need to clean up all objects that contented on a lock. 
This won't be perfect (as some thread // might not yet added the object to the free list), but whatever we don't get now we'll clean up in the next // iteration - ActorLineage* toClean; - while (freeList.pop(toClean)) { - toClean->delref(); - } + freeList.consume_all([](auto toClean) { toClean->delRef(); }); return result; } diff --git a/flow/CMakeLists.txt b/flow/CMakeLists.txt index c838e8eff8..5e89fe4d28 100644 --- a/flow/CMakeLists.txt +++ b/flow/CMakeLists.txt @@ -3,6 +3,7 @@ find_package(Threads REQUIRED) set(FLOW_SRCS ActorCollection.actor.cpp ActorCollection.h + ActorLineageSet.cpp Arena.cpp Arena.h AsioReactor.h From f6c7aa6ac77e55266e030109eb77d24b8894952e Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Wed, 17 Mar 2021 15:50:29 -0600 Subject: [PATCH 014/180] fixed typo --- flow/ActorLineageSet.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flow/ActorLineageSet.cpp b/flow/ActorLineageSet.cpp index 0957339501..9a0d34c9bf 100644 --- a/flow/ActorLineageSet.cpp +++ b/flow/ActorLineageSet.cpp @@ -81,7 +81,7 @@ std::vector> ActorLineageSet::copy() { // after we're done we need to clean up all objects that contented on a lock. 
This won't be perfect (as some thread // might not yet added the object to the free list), but whatever we don't get now we'll clean up in the next // iteration - freeList.consume_all([](auto toClean) { toClean->delRef(); }); + freeList.consume_all([](auto toClean) { toClean->delref(); }); return result; } From 4f1b807e1f480f24a0e3cb9622149953c295a4ab Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Wed, 17 Mar 2021 16:01:23 -0600 Subject: [PATCH 015/180] assert object alignment --- flow/ActorLineageSet.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/flow/ActorLineageSet.cpp b/flow/ActorLineageSet.cpp index 9a0d34c9bf..570976379c 100644 --- a/flow/ActorLineageSet.cpp +++ b/flow/ActorLineageSet.cpp @@ -93,6 +93,7 @@ ActorLineageSet::Index ActorLineageSet::insert(const Reference& li } ASSERT(_set[res].load() & FREE); auto ptr = reinterpret_cast(lineage.getPtr()); + ASSERT((ptr % 4) == 0); // this needs to be at least 4-byte aligned lineage->addref(); _set[res].store(ptr); return res; From 5c1b674815b1765dbc08eed4d98875163dee5708 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Fri, 19 Mar 2021 10:31:58 -0600 Subject: [PATCH 016/180] implemented test --- flow/CMakeLists.txt | 2 +- flow/WriteOnlySet.actor.cpp | 159 +++++++++++++++++++ flow/{ActorLineageSet.cpp => WriteOnlySet.h} | 75 ++++----- 3 files changed, 187 insertions(+), 49 deletions(-) create mode 100644 flow/WriteOnlySet.actor.cpp rename flow/{ActorLineageSet.cpp => WriteOnlySet.h} (60%) diff --git a/flow/CMakeLists.txt b/flow/CMakeLists.txt index 5e89fe4d28..4c28aee437 100644 --- a/flow/CMakeLists.txt +++ b/flow/CMakeLists.txt @@ -3,7 +3,6 @@ find_package(Threads REQUIRED) set(FLOW_SRCS ActorCollection.actor.cpp ActorCollection.h - ActorLineageSet.cpp Arena.cpp Arena.h AsioReactor.h @@ -70,6 +69,7 @@ set(FLOW_SRCS TreeBenchmark.h UnitTest.cpp UnitTest.h + WriteOnlySet.actor.cpp XmlTraceLogFormatter.cpp XmlTraceLogFormatter.h actorcompiler.h diff --git a/flow/WriteOnlySet.actor.cpp 
b/flow/WriteOnlySet.actor.cpp new file mode 100644 index 0000000000..d0f7c514ad --- /dev/null +++ b/flow/WriteOnlySet.actor.cpp @@ -0,0 +1,159 @@ +/* + * WriteOnlySet.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2021 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flow/DeterministicRandom.h" +#include "flow/WriteOnlySet.h" +#include "flow/flow.h" +#include "flow/UnitTest.h" + +#include +#include +#include "flow/actorcompiler.h" // has to be last include + +template +auto WriteOnlySet::insert(const Reference& lineage) -> Index { + Index res; + if (!freeQueue.pop(res)) { + TraceEvent(SevWarnAlways, "NoCapacityInWriteOnlySet"); + return npos; + } + ASSERT(_set[res].load() & FREE); + auto ptr = reinterpret_cast(lineage.getPtr()); + ASSERT((ptr % 4) == 0); // this needs to be at least 4-byte aligned + ASSERT((ptr & FREE) == 0 && (ptr & LOCK) == 0); + lineage->addref(); + _set[res].store(ptr); + return res; +} + +template +void WriteOnlySet::erase(Index idx) { + while (true) { + auto ptr = _set[idx].load(); + if (ptr & LOCK) { + _set[idx].store(FREE); + freeList.push(reinterpret_cast(ptr ^ LOCK)); + return; + } else { + if (_set[idx].compare_exchange_strong(ptr, FREE)) { + reinterpret_cast(ptr)->delref(); + return; + } + } + } +} + +// Explicit instantiation +template class WriteOnlySet; + +// testing code +namespace { + +std::atomic 
instanceCounter = 0; +constexpr double iteration_frequency = 10.0; + +struct TestObject { + mutable std::atomic _refCount = 1; + TestObject() { instanceCounter.fetch_add(1); } + void delref() const { + if (--_refCount == 0) { + delete this; + --instanceCounter; + } + } + void addref() const { ++_refCount; } +}; + +using TestSet = WriteOnlySet; +using Clock = std::chrono::steady_clock; + +ACTOR Future threadjoiner(std::shared_ptr> threads, std::shared_ptr set) { + loop { + wait(delay(0.1)); + for (unsigned i = 0;;) { + if (threads->size() == i) { + break; + } + auto& t = (*threads)[i]; + if (t.joinable()) { + t.join(); + if (i + 1 < threads->size()) { + std::swap(*threads->rbegin(), (*threads)[i]); + } + threads->pop_back(); + } else { + ++i; + } + } + if (threads->empty()) { + set->copy(); + ASSERT(instanceCounter.load() == 0); + return Void(); + } + } +} + +void testCopier(std::shared_ptr set, std::chrono::seconds runFor) { + auto start = Clock::now(); + while (true) { + if (Clock::now() - start > runFor) { + return; + } + auto copy = set->copy(); + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + } +} + +void writer(std::shared_ptr set, std::chrono::seconds runFor) { + auto start = Clock::now(); + std::random_device rDev; + DeterministicRandom rnd(rDev()); + while (true) { + if (Clock::now() - start > runFor) { + return; + } + std::vector positions; + for (int i = 0; i < rnd.randomInt(1, 101); ++i) { + positions.push_back(set->insert(Reference(new TestObject()))); + } + rnd.randomShuffle(positions); + for (auto p : positions) { + set->erase(p); + } + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + } +} + +TEST_CASE("/flow/WriteOnlySet") { + if (g_network->isSimulated()) { + // This test is not deterministic, so we shouldn't run it in simulation + return Void(); + } + auto set = std::make_shared(); + auto threads = std::make_shared>(); + std::chrono::seconds runFor(10); + for (int i = 0; i < 5; ++i) { + threads->emplace_back([set, 
runFor]() { writer(set, runFor); }); + } + threads->emplace_back([set, runFor]() { testCopier(set, runFor); }); + wait(threadjoiner(threads, set)); + return Void(); +} +} // namespace \ No newline at end of file diff --git a/flow/ActorLineageSet.cpp b/flow/WriteOnlySet.h similarity index 60% rename from flow/ActorLineageSet.cpp rename to flow/WriteOnlySet.h index 570976379c..a319ad22f0 100644 --- a/flow/ActorLineageSet.cpp +++ b/flow/WriteOnlySet.h @@ -1,9 +1,9 @@ /* - * ActorLineageSet.cpp + * WriteOnlySet.cpp * * This source file is part of the FoundationDB open source project * - * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors + * Copyright 2013-2021 Apple Inc. and the FoundationDB project authors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,20 +18,23 @@ * limitations under the License. */ -#include "flow/flow.h" +#pragma once +#include "flow/Error.h" +#include "flow/FastRef.h" +#include "flow/Trace.h" #include -class ActorLineageSet { +template +class WriteOnlySet { public: // The type we use for lookup into the set. Gets assigned during insert - using Index = unsigned; + using Index = IndexType; // For now we use a fixed size capacity - constexpr static Index CAPACITY = 1024; constexpr static Index npos = std::numeric_limits::max(); - explicit ActorLineageSet(); - ActorLineageSet(const ActorLineageSet&) = delete; - ActorLineageSet& operator=(const ActorLineageSet&) = delete; + explicit WriteOnlySet(); + WriteOnlySet(const WriteOnlySet&) = delete; + WriteOnlySet& operator=(const WriteOnlySet&) = delete; // Returns the number of elements at the time of calling. Keep in mind that this is a lockfree data structure, so // the actual size might change anytime after or even during the call. This function only guarantees that the size @@ -39,36 +42,39 @@ public: // to handle this is by assuming that this returns an estimate. 
unsigned size(); - Index insert(const Reference& lineage); + Index insert(const Reference& lineage); void erase(Index idx); - std::vector> copy(); + std::vector> copy(); private: static constexpr uintptr_t FREE = 0b1; static constexpr uintptr_t LOCK = 0b10; - std::atomic _size = 0; + std::atomic _size = 0; std::vector> _set; + static_assert(std::atomic::is_always_lock_free, "Index type can't be used as a lock-free type"); + static_assert(std::atomic::is_always_lock_free, "uintptr_t can't be used as a lock-free type"); boost::lockfree::queue, boost::lockfree::capacity> freeQueue; - boost::lockfree::queue, boost::lockfree::capacity> - freeList; + boost::lockfree::queue, boost::lockfree::capacity> freeList; }; -ActorLineageSet::ActorLineageSet() { +template +WriteOnlySet::WriteOnlySet() : _set(CAPACITY) { // insert the free indexes in reverse order for (unsigned i = CAPACITY; i > 0; --i) { freeQueue.push(i - 1); - _set[i] = uintptr_t(1); + _set[i] = uintptr_t(FREE); } } -std::vector> ActorLineageSet::copy() { - std::vector> result; +template +std::vector> WriteOnlySet::copy() { + std::vector> result; for (int i = 0; i < CAPACITY; ++i) { auto ptr = _set[i].load(); if ((ptr & FREE) != 0) { ASSERT((ptr & LOCK) == 0); if (_set[i].compare_exchange_strong(ptr, ptr | LOCK)) { - ActorLineage* entry = reinterpret_cast(ptr); + T* entry = reinterpret_cast(ptr); ptr |= LOCK; entry->addref(); // we try to unlock now. 
If this element was removed while we incremented the refcount, the element will @@ -85,32 +91,5 @@ std::vector> ActorLineageSet::copy() { return result; } -ActorLineageSet::Index ActorLineageSet::insert(const Reference& lineage) { - Index res; - if (!freeQueue.pop(res)) { - TraceEvent(SevWarnAlways, "NoCapacityInActorLineageSet"); - return npos; - } - ASSERT(_set[res].load() & FREE); - auto ptr = reinterpret_cast(lineage.getPtr()); - ASSERT((ptr % 4) == 0); // this needs to be at least 4-byte aligned - lineage->addref(); - _set[res].store(ptr); - return res; -} - -void ActorLineageSet::erase(Index idx) { - while (true) { - auto ptr = _set[idx].load(); - if (ptr & LOCK) { - _set[idx].store(FREE); - freeList.push(reinterpret_cast(ptr ^ LOCK)); - return; - } else { - if (_set[idx].compare_exchange_strong(ptr, FREE)) { - reinterpret_cast(ptr)->delref(); - return; - } - } - } -} \ No newline at end of file +class ActorLineage; +extern template class WriteOnlySet; From 459afeed4cd9d6df4892e085f94d369af59f1efc Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Fri, 19 Mar 2021 11:25:55 -0600 Subject: [PATCH 017/180] disable jemalloc on macOS --- cmake/Jemalloc.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/Jemalloc.cmake b/cmake/Jemalloc.cmake index 6dff173b93..e89ef3ce82 100644 --- a/cmake/Jemalloc.cmake +++ b/cmake/Jemalloc.cmake @@ -3,7 +3,7 @@ add_library(jemalloc INTERFACE) set(USE_JEMALLOC ON) # We don't want to use jemalloc on Windows # Nor on FreeBSD, where jemalloc is the default system allocator -if(USE_SANITIZER OR WIN32 OR (CMAKE_SYSTEM_NAME STREQUAL "FreeBSD")) +if(USE_SANITIZER OR WIN32 OR (CMAKE_SYSTEM_NAME STREQUAL "FreeBSD") OR APPLE) set(USE_JEMALLOC OFF) return() endif() From 995ae34b1e637f6f776fc889e00474eb1ca1a322 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Fri, 19 Mar 2021 17:10:42 -0600 Subject: [PATCH 018/180] Bugfxies & hack to allow new unit test to run --- fdbserver/fdbserver.actor.cpp | 4 ++ 
flow/WriteOnlySet.actor.cpp | 89 ++++++++++++++++++++++++++++++----- flow/WriteOnlySet.h | 44 +++-------------- 3 files changed, 89 insertions(+), 48 deletions(-) diff --git a/fdbserver/fdbserver.actor.cpp b/fdbserver/fdbserver.actor.cpp index ff28269e4f..a285c0b958 100644 --- a/fdbserver/fdbserver.actor.cpp +++ b/fdbserver/fdbserver.actor.cpp @@ -66,6 +66,7 @@ #include "flow/SystemMonitor.h" #include "flow/TLSConfig.actor.h" #include "flow/Tracing.h" +#include "flow/WriteOnlySet.h" #if defined(__linux__) || defined(__FreeBSD__) #include @@ -1572,6 +1573,9 @@ private: } // namespace int main(int argc, char* argv[]) { + // TODO: Remove later, this is just to force the statics to be initialized + // otherwise the unit test won't run + ActorLineageSet _; try { platformInit(); diff --git a/flow/WriteOnlySet.actor.cpp b/flow/WriteOnlySet.actor.cpp index d0f7c514ad..32023f5e24 100644 --- a/flow/WriteOnlySet.actor.cpp +++ b/flow/WriteOnlySet.actor.cpp @@ -34,32 +34,75 @@ auto WriteOnlySet::insert(const Reference& lineage) - TraceEvent(SevWarnAlways, "NoCapacityInWriteOnlySet"); return npos; } - ASSERT(_set[res].load() & FREE); + ASSERT(_set[res].load() == 0); auto ptr = reinterpret_cast(lineage.getPtr()); - ASSERT((ptr % 4) == 0); // this needs to be at least 4-byte aligned - ASSERT((ptr & FREE) == 0 && (ptr & LOCK) == 0); + ASSERT((ptr % 2) == 0); // this needs to be at least 2-byte aligned + ASSERT(ptr != 0); lineage->addref(); _set[res].store(ptr); return res; } template -void WriteOnlySet::erase(Index idx) { +bool WriteOnlySet::eraseImpl(Index idx) { while (true) { auto ptr = _set[idx].load(); if (ptr & LOCK) { - _set[idx].store(FREE); + _set[idx].store(0); freeList.push(reinterpret_cast(ptr ^ LOCK)); - return; + return false; } else { - if (_set[idx].compare_exchange_strong(ptr, FREE)) { + if (_set[idx].compare_exchange_strong(ptr, 0)) { reinterpret_cast(ptr)->delref(); - return; + return true; } } } } +template +bool WriteOnlySet::erase(Index idx) { + auto res = 
eraseImpl(idx); + ASSERT(freeQueue.push(idx)); + return res; +} + +template +WriteOnlySet::WriteOnlySet() : _set(CAPACITY) { + // insert the free indexes in reverse order + for (unsigned i = CAPACITY; i > 0; --i) { + freeQueue.push(i - 1); + _set[i] = uintptr_t(0); + } +} + +template +std::vector> WriteOnlySet::copy() { + std::vector> result; + for (int i = 0; i < CAPACITY; ++i) { + auto ptr = _set[i].load(); + if (ptr) { + ASSERT((ptr & LOCK) == 0); // if we lock something we need to immediately unlock after we're done copying + // We attempt lock so this won't get deleted. We will try this only once, if the other thread removed the + // object from the set between the previews lines and now, we just won't make it part of the result. + if (_set[i].compare_exchange_strong(ptr, ptr | LOCK)) { + T* entry = reinterpret_cast(ptr); + ptr |= LOCK; + entry->addref(); + // we try to unlock now. If this element was removed while we incremented the refcount, the element will + // end up in the freeList, so we will decrement later. + _set[i].compare_exchange_strong(ptr, ptr ^ LOCK); + result.emplace_back(entry); + } + } + } + // after we're done we need to clean up all objects that contented on a lock. 
This won't be perfect (as some thread + // might not yet added the object to the free list), but whatever we don't get now we'll clean up in the next + // iteration + freeList.consume_all([](auto toClean) { toClean->delref(); }); + return result; +} + // Explicit instantiation template class WriteOnlySet; @@ -67,7 +110,10 @@ template class WriteOnlySet; namespace { std::atomic instanceCounter = 0; -constexpr double iteration_frequency = 10.0; +std::atomic numInserts = 0; +std::atomic numErase = 0; +std::atomic numLockedErase = 0; +std::atomic numCopied = 0; struct TestObject { mutable std::atomic _refCount = 1; @@ -117,6 +163,7 @@ void testCopier(std::shared_ptr set, std::chrono::seconds runFor) { return; } auto copy = set->copy(); + numCopied.fetch_add(copy.size()); std::this_thread::sleep_for(std::chrono::milliseconds(10)); } } @@ -126,17 +173,32 @@ void writer(std::shared_ptr set, std::chrono::seconds runFor) { std::random_device rDev; DeterministicRandom rnd(rDev()); while (true) { + unsigned inserts = 0, erases = 0; if (Clock::now() - start > runFor) { return; } std::vector positions; for (int i = 0; i < rnd.randomInt(1, 101); ++i) { - positions.push_back(set->insert(Reference(new TestObject()))); + Reference o(new TestObject()); + auto pos = set->insert(o); + if (pos == TestSet::npos) { + // could not insert -- ignore + break; + } + ++inserts; + ASSERT(pos < TestSet::capacity); + positions.push_back(pos); } rnd.randomShuffle(positions); for (auto p : positions) { - set->erase(p); + if (!set->erase(p)) { + ++numLockedErase; + } + ++erases; } + numInserts.fetch_add(inserts); + numErase.fetch_add(erases); + ASSERT(inserts == erases); std::this_thread::sleep_for(std::chrono::milliseconds(1)); } } @@ -154,6 +216,11 @@ TEST_CASE("/flow/WriteOnlySet") { } threads->emplace_back([set, runFor]() { testCopier(set, runFor); }); wait(threadjoiner(threads, set)); + TraceEvent("WriteOnlySetTestResult") + .detail("Inserts", numInserts.load()) + .detail("Erases", 
numErase.load()) + .detail("Copies", numCopied.load()) + .detail("LockedErase", numLockedErase.load()); return Void(); } } // namespace \ No newline at end of file diff --git a/flow/WriteOnlySet.h b/flow/WriteOnlySet.h index a319ad22f0..9d80795c68 100644 --- a/flow/WriteOnlySet.h +++ b/flow/WriteOnlySet.h @@ -31,6 +31,7 @@ public: using Index = IndexType; // For now we use a fixed size capacity constexpr static Index npos = std::numeric_limits::max(); + constexpr static IndexType capacity = CAPACITY; explicit WriteOnlySet(); WriteOnlySet(const WriteOnlySet&) = delete; @@ -43,12 +44,13 @@ public: unsigned size(); Index insert(const Reference& lineage); - void erase(Index idx); + bool erase(Index idx); std::vector> copy(); private: - static constexpr uintptr_t FREE = 0b1; - static constexpr uintptr_t LOCK = 0b10; + bool eraseImpl(Index idx); + + static constexpr uintptr_t LOCK = 0b1; std::atomic _size = 0; std::vector> _set; static_assert(std::atomic::is_always_lock_free, "Index type can't be used as a lock-free type"); @@ -57,39 +59,7 @@ private: boost::lockfree::queue, boost::lockfree::capacity> freeList; }; -template -WriteOnlySet::WriteOnlySet() : _set(CAPACITY) { - // insert the free indexes in reverse order - for (unsigned i = CAPACITY; i > 0; --i) { - freeQueue.push(i - 1); - _set[i] = uintptr_t(FREE); - } -} - -template -std::vector> WriteOnlySet::copy() { - std::vector> result; - for (int i = 0; i < CAPACITY; ++i) { - auto ptr = _set[i].load(); - if ((ptr & FREE) != 0) { - ASSERT((ptr & LOCK) == 0); - if (_set[i].compare_exchange_strong(ptr, ptr | LOCK)) { - T* entry = reinterpret_cast(ptr); - ptr |= LOCK; - entry->addref(); - // we try to unlock now. If this element was removed while we incremented the refcount, the element will - // end up in the freeList, so we will decrement later. - _set[i].compare_exchange_strong(ptr, ptr ^ LOCK); - result.emplace_back(entry); - } - } - } - // after we're done we need to clean up all objects that contented on a lock. 
This won't be perfect (as some thread - // might not yet added the object to the free list), but whatever we don't get now we'll clean up in the next - // iteration - freeList.consume_all([](auto toClean) { toClean->delref(); }); - return result; -} - class ActorLineage; extern template class WriteOnlySet; + +using ActorLineageSet = WriteOnlySet; From 99ac47e96c10922ca40e1267467bcfcbb51a51a0 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Fri, 19 Mar 2021 18:08:09 -0600 Subject: [PATCH 019/180] documentation --- flow/WriteOnlySet.actor.cpp | 6 ++++ flow/WriteOnlySet.h | 65 +++++++++++++++++++++++++++++++++---- 2 files changed, 64 insertions(+), 7 deletions(-) diff --git a/flow/WriteOnlySet.actor.cpp b/flow/WriteOnlySet.actor.cpp index 32023f5e24..93d9e99fc7 100644 --- a/flow/WriteOnlySet.actor.cpp +++ b/flow/WriteOnlySet.actor.cpp @@ -109,12 +109,14 @@ template class WriteOnlySet; // testing code namespace { +// Some statistics std::atomic instanceCounter = 0; std::atomic numInserts = 0; std::atomic numErase = 0; std::atomic numLockedErase = 0; std::atomic numCopied = 0; +// A simple object that counts the number of its instances. This is used to detect memory leaks. struct TestObject { mutable std::atomic _refCount = 1; TestObject() { instanceCounter.fetch_add(1); } @@ -130,6 +132,7 @@ struct TestObject { using TestSet = WriteOnlySet; using Clock = std::chrono::steady_clock; +// An actor that can join a set of threads in an async way. ACTOR Future threadjoiner(std::shared_ptr> threads, std::shared_ptr set) { loop { wait(delay(0.1)); @@ -156,6 +159,7 @@ ACTOR Future threadjoiner(std::shared_ptr> thread } } +// occasionally copy the contents of the past set. 
void testCopier(std::shared_ptr set, std::chrono::seconds runFor) { auto start = Clock::now(); while (true) { @@ -168,6 +172,7 @@ void testCopier(std::shared_ptr set, std::chrono::seconds runFor) { } } +// In a loop adds and removes a set of objects to the set void writer(std::shared_ptr set, std::chrono::seconds runFor) { auto start = Clock::now(); std::random_device rDev; @@ -203,6 +208,7 @@ void writer(std::shared_ptr set, std::chrono::seconds runFor) { } } +// This unit test creates 5 writer threads and one copier thread. TEST_CASE("/flow/WriteOnlySet") { if (g_network->isSimulated()) { // This test is not deterministic, so we shouldn't run it in simulation diff --git a/flow/WriteOnlySet.h b/flow/WriteOnlySet.h index 9d80795c68..a2589ec387 100644 --- a/flow/WriteOnlySet.h +++ b/flow/WriteOnlySet.h @@ -24,6 +24,21 @@ #include "flow/Trace.h" #include +/** + * This is a Write-Only set that supports copying the whole content. This data structure is lock-free and allows a user + * to insert and remove objects up to a given capacity (passed by a template). + * + * Template parameters: + * \param T The type to store. + * \param IndexType The type used as an index + * \param CAPACITY The maximum number of object this structure can store (if a user tries to store more, insert will + * fail gracefully) + * \pre T implements `void addref() const` and `void delref() const` + * \pre IndexType must have a copy constructor + * \pre IndexType must have a trivial assignment operator + * \pre IndexType must have a trivial destructor + * \pre IndexType can be used as an index into a std::vector + */ template class WriteOnlySet { public: @@ -37,25 +52,61 @@ public: WriteOnlySet(const WriteOnlySet&) = delete; WriteOnlySet& operator=(const WriteOnlySet&) = delete; - // Returns the number of elements at the time of calling. Keep in mind that this is a lockfree data structure, so - // the actual size might change anytime after or even during the call. 
This function only guarantees that the size - // was whatever the method returns at one point between the start and the end of the function call. The safest way - // to handle this is by assuming that this returns an estimate. - unsigned size(); + /** + * Attempts to insert \p lineage into the set. This method can fail if the set is full (its size is equal to its + * capacity). Calling insert on a full set is safe but the method will return \ref npos if the operation fails. + * + * \param lineage A reference to the object the user wants to insert. + * \ret An index that can later be used to erase the value again or \ref npos if the insert failed. + * \pre lineage.getPtr() % 2 == 0 (the memory for lineage has to be at least 2 byte aligned) + */ + [[nodiscard]] Index insert(const Reference& lineage); - Index insert(const Reference& lineage); + /** + * Erases the object associated with \p idx from the set. + * + * \ret Whether the reference count was decremented. Usually the return value is only interesting for testing and + * benchmarking purposes and will in most cases be ignored. If \ref delref wasn't called, it will be called + * later. Note that at the time the return value is checked, \ref delref might already have been called. + */ bool erase(Index idx); + /** + * Copies all elements that are stored in the set into a vector. This copy operation does NOT provide a snapshot of + * the data structure. The contract is weak: + * - All object that were in the set before copy is called and weren't removed until after copy returned are + * guaranteed to be in the result. + * - Any object that was inserted while copy is running might be in the result. + * - Any object that was erased while copy is running might be in the result. + */ std::vector> copy(); private: + // the implementation of erase -- the wrapper just makes the function a bit more readable. 
bool eraseImpl(Index idx); + // the last bit of a pointer within the set is used like a boolean and true means that the object is locked. Locking + // an object is only relevant for memory management. A locked pointer can still be erased from the set, but the + // erase won't call delref on the object. Instead it will push the pointer into the \ref freeList and copy will call + // delref later. static constexpr uintptr_t LOCK = 0b1; - std::atomic _size = 0; + + // The actual memory std::vector> _set; static_assert(std::atomic::is_always_lock_free, "Index type can't be used as a lock-free type"); static_assert(std::atomic::is_always_lock_free, "uintptr_t can't be used as a lock-free type"); + + // The freeQueue. On creation all indexes (0..capacity-1) are pushed into this queue. On insert one element from + // this queue is consumed and the resulting number is used as an index into the set. On erase the index is given + // back to the freeQueue. boost::lockfree::queue, boost::lockfree::capacity> freeQueue; + + // The freeList is used for memory management. Generally copying a shared pointer can't be done in a lock-free way. + // Instead, when we copy the data structure we first copy the address, then attempt to set the last bit to 1 and + // only if that succeeds we will increment the reference count. Whenever we attempt to remove an object + // in \ref erase we remove the object from the set (using an atomic compare and swap) and only decrement the + // reference count if the last bit is 0. If it's not we'll push the pointer into this free list. + // \ref copy will consume all elements from this freeList each time it runs and decrements the refcount for each + // element. 
boost::lockfree::queue, boost::lockfree::capacity> freeList; }; From 61352b912444c5d3601b8e33de234cc1f61fe32b Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Mon, 22 Mar 2021 11:41:45 -0600 Subject: [PATCH 020/180] use push_back where emplace_back is unnecessary --- flow/WriteOnlySet.actor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flow/WriteOnlySet.actor.cpp b/flow/WriteOnlySet.actor.cpp index 93d9e99fc7..9ab63aa56f 100644 --- a/flow/WriteOnlySet.actor.cpp +++ b/flow/WriteOnlySet.actor.cpp @@ -92,7 +92,7 @@ std::vector> WriteOnlySet::copy() { // we try to unlock now. If this element was removed while we incremented the refcount, the element will // end up in the freeList, so we will decrement later. _set[i].compare_exchange_strong(ptr, ptr ^ LOCK); - result.emplace_back(entry); + result.push_back(entry); } } } From 301daf326939d6378d410420d007322f7c7a3dd3 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Mon, 22 Mar 2021 11:46:16 -0600 Subject: [PATCH 021/180] address review comments --- flow/WriteOnlySet.actor.cpp | 2 +- flow/WriteOnlySet.h | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/flow/WriteOnlySet.actor.cpp b/flow/WriteOnlySet.actor.cpp index 9ab63aa56f..364c53460d 100644 --- a/flow/WriteOnlySet.actor.cpp +++ b/flow/WriteOnlySet.actor.cpp @@ -1,5 +1,5 @@ /* - * WriteOnlySet.cpp + * WriteOnlySet.actor.cpp * * This source file is part of the FoundationDB open source project * diff --git a/flow/WriteOnlySet.h b/flow/WriteOnlySet.h index a2589ec387..c71736f852 100644 --- a/flow/WriteOnlySet.h +++ b/flow/WriteOnlySet.h @@ -1,5 +1,5 @@ /* - * WriteOnlySet.cpp + * WriteOnlySet.h * * This source file is part of the FoundationDB open source project * @@ -50,7 +50,9 @@ public: explicit WriteOnlySet(); WriteOnlySet(const WriteOnlySet&) = delete; + WriteOnlySet(WriteOnlySet&&) = delete; WriteOnlySet& operator=(const WriteOnlySet&) = delete; + WriteOnlySet& operator=(WriteOnlySet&&) = delete; /** * Attempts to 
insert \p lineage into the set. This method can fail if the set is full (its size is equal to its @@ -93,7 +95,7 @@ private: // The actual memory std::vector> _set; static_assert(std::atomic::is_always_lock_free, "Index type can't be used as a lock-free type"); - static_assert(std::atomic::is_always_lock_free, "uintptr_t can't be used as a lock-free type"); + static_assert(std::atomic::is_always_lock_free, "uintptr_t can't be used as a lock-free type"); // The freeQueue. On creation all indexes (0..capacity-1) are pushed into this queue. On insert one element from // this queue is consumed and the resulting number is used as an index into the set. On erase the index is given From 5bd79de88179945a78e7862d90e7de183d3d690c Mon Sep 17 00:00:00 2001 From: Lukas Joswiak Date: Mon, 22 Mar 2021 10:01:28 -0700 Subject: [PATCH 022/180] Fix build --- flow/Profiler.actor.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/flow/Profiler.actor.cpp b/flow/Profiler.actor.cpp index 46b0bcecb4..24bba87739 100644 --- a/flow/Profiler.actor.cpp +++ b/flow/Profiler.actor.cpp @@ -142,6 +142,8 @@ struct Profiler { } void signal_handler() { // async signal safe! + static std::atomic inSigHandler = false; + if (inSigHandler.exchange(true)) { return; } if (profilingEnabled) { double t = timer(); output_buffer->push(*(void**)&t); From 0ec7340a6f72f8d29b43ade50667d2b0e88ebd75 Mon Sep 17 00:00:00 2001 From: Lukas Joswiak Date: Mon, 22 Mar 2021 10:55:52 -0700 Subject: [PATCH 023/180] Create reference --- flow/WriteOnlySet.actor.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/flow/WriteOnlySet.actor.cpp b/flow/WriteOnlySet.actor.cpp index 364c53460d..92eceea7bc 100644 --- a/flow/WriteOnlySet.actor.cpp +++ b/flow/WriteOnlySet.actor.cpp @@ -92,7 +92,7 @@ std::vector> WriteOnlySet::copy() { // we try to unlock now. If this element was removed while we incremented the refcount, the element will // end up in the freeList, so we will decrement later. 
_set[i].compare_exchange_strong(ptr, ptr ^ LOCK); - result.push_back(entry); + result.push_back(Reference(entry)); } } } @@ -229,4 +229,4 @@ TEST_CASE("/flow/WriteOnlySet") { .detail("LockedErase", numLockedErase.load()); return Void(); } -} // namespace \ No newline at end of file +} // namespace From 2dfd420882537d7fa7d477c08b699f1a5e961a1c Mon Sep 17 00:00:00 2001 From: Lukas Joswiak Date: Wed, 24 Mar 2021 14:52:42 -0700 Subject: [PATCH 024/180] Add sampling profiler thread --- fdbrpc/AsyncFileKAIO.actor.h | 6 +++++- fdbrpc/IAsyncFile.h | 4 ++++ fdbrpc/Net2FileSystem.cpp | 4 ++++ fdbrpc/Net2FileSystem.h | 3 +++ fdbrpc/sim2.actor.cpp | 4 ++++ fdbrpc/simulator.h | 4 ++++ fdbserver/fdbserver.actor.cpp | 1 + flow/Platform.actor.cpp | 27 +++++++++++++++++++++++++++ flow/Platform.h | 2 ++ 9 files changed, 54 insertions(+), 1 deletion(-) diff --git a/fdbrpc/AsyncFileKAIO.actor.h b/fdbrpc/AsyncFileKAIO.actor.h index 5e6592e6ba..dbdb040d00 100644 --- a/fdbrpc/AsyncFileKAIO.actor.h +++ b/fdbrpc/AsyncFileKAIO.actor.h @@ -242,7 +242,11 @@ public: // result = map(result, [=](int r) mutable { KAIOLogBlockEvent(io, OpLogEntry::READY, r); return r; }); #endif - return success(result); + auto& actorLineageSet = IAsyncFileSystem::filesystem()->getActorLineageSet(); + auto index = actorLineageSet.insert(currentLineage); + Future res = success(result); + actorLineageSet.erase(index); + return res; } // TODO(alexmiller): Remove when we upgrade the dev docker image to >14.10 #ifndef FALLOC_FL_ZERO_RANGE diff --git a/fdbrpc/IAsyncFile.h b/fdbrpc/IAsyncFile.h index ed703514c6..ad48db5f07 100644 --- a/fdbrpc/IAsyncFile.h +++ b/fdbrpc/IAsyncFile.h @@ -25,6 +25,7 @@ #include #include "flow/flow.h" +#include "flow/WriteOnlySet.h" #include "fdbrpc/IRateControl.h" // All outstanding operations must be cancelled before the destructor of IAsyncFile is called. @@ -118,6 +119,9 @@ public: // Returns the time of the last modification of the file. 
virtual Future lastWriteTime(const std::string& filename) = 0; + // Returns the shared memory data structure used to store actor lineages. + virtual ActorLineageSet& getActorLineageSet() = 0; + static IAsyncFileSystem* filesystem() { return filesystem(g_network); } static runCycleFuncPtr runCycleFunc() { return reinterpret_cast( diff --git a/fdbrpc/Net2FileSystem.cpp b/fdbrpc/Net2FileSystem.cpp index 71a7d784a1..8e895c08dc 100644 --- a/fdbrpc/Net2FileSystem.cpp +++ b/fdbrpc/Net2FileSystem.cpp @@ -89,6 +89,10 @@ Future Net2FileSystem::lastWriteTime(const std::string& filename) { return Net2AsyncFile::lastWriteTime(filename); } +ActorLineageSet& Net2FileSystem::getActorLineageSet() { + return actorLineageSet; +} + void Net2FileSystem::newFileSystem(double ioTimeout, const std::string& fileSystemPath) { g_network->setGlobal(INetwork::enFileSystem, (flowGlobalType) new Net2FileSystem(ioTimeout, fileSystemPath)); } diff --git a/fdbrpc/Net2FileSystem.h b/fdbrpc/Net2FileSystem.h index 702b87828f..0c2229b5ca 100644 --- a/fdbrpc/Net2FileSystem.h +++ b/fdbrpc/Net2FileSystem.h @@ -39,6 +39,8 @@ public: Future renameFile(std::string const& from, std::string const& to) override; + ActorLineageSet& getActorLineageSet() override; + // void init(); static void stop(); @@ -52,6 +54,7 @@ public: dev_t fileSystemDeviceId; bool checkFileSystem; #endif + ActorLineageSet actorLineageSet; }; #endif diff --git a/fdbrpc/sim2.actor.cpp b/fdbrpc/sim2.actor.cpp index 6101ca8512..e9219f3ff3 100644 --- a/fdbrpc/sim2.actor.cpp +++ b/fdbrpc/sim2.actor.cpp @@ -2494,6 +2494,10 @@ Future Sim2FileSystem::lastWriteTime(const std::string& filename) { return fileWrites[filename]; } +ActorLineageSet& Sim2FileSystem::getActorLineageSet() { + return actorLineageSet; +} + void Sim2FileSystem::newFileSystem() { g_network->setGlobal(INetwork::enFileSystem, (flowGlobalType) new Sim2FileSystem()); } diff --git a/fdbrpc/simulator.h b/fdbrpc/simulator.h index cde0eb0dda..08b4264e81 100644 --- a/fdbrpc/simulator.h 
+++ b/fdbrpc/simulator.h @@ -471,6 +471,8 @@ public: Future lastWriteTime(const std::string& filename) override; + ActorLineageSet& getActorLineageSet() override; + Future renameFile(std::string const& from, std::string const& to) override; Sim2FileSystem() {} @@ -478,6 +480,8 @@ public: ~Sim2FileSystem() override {} static void newFileSystem(); + + ActorLineageSet actorLineageSet; }; #endif diff --git a/fdbserver/fdbserver.actor.cpp b/fdbserver/fdbserver.actor.cpp index a285c0b958..fbcd7fd9ee 100644 --- a/fdbserver/fdbserver.actor.cpp +++ b/fdbserver/fdbserver.actor.cpp @@ -1948,6 +1948,7 @@ int main(int argc, char* argv[]) { ASSERT(opts.connectionFile); setupRunLoopProfiler(); + setupSamplingProfiler(); auto dataFolder = opts.dataFolder; if (!dataFolder.size()) diff --git a/flow/Platform.actor.cpp b/flow/Platform.actor.cpp index 42d8decccc..756fb6a7e3 100644 --- a/flow/Platform.actor.cpp +++ b/flow/Platform.actor.cpp @@ -48,6 +48,8 @@ #include "flow/UnitTest.h" #include "flow/FaultInjection.h" +#include "fdbrpc/IAsyncFile.h" + #ifdef _WIN32 #include #include @@ -3673,6 +3675,31 @@ void setupRunLoopProfiler() { #endif } +void* sampleThread(void* arg) { + while (true) { + threadSleep(1.0); // TODO: Read sample rate from global config + + // TODO: Copy actor lineage of currently running actor + + auto diskAlps = IAsyncFileSystem::filesystem()->getActorLineageSet().copy(); + printf("Disk ALPs: %d\n", diskAlps.size()); + + // TODO: Call collect on all actor lineages + for (auto actorLineage : diskAlps) { + } + + // TODO: Serialize collected actor linage properties + } + + return nullptr; +} + +void setupSamplingProfiler() { + // TODO: Add knob + TraceEvent("StartingSamplingProfilerThread"); + startThread(&sampleThread, nullptr); +} + // UnitTest for getMemoryInfo #ifdef __linux__ TEST_CASE("/flow/Platform/getMemoryInfo") { diff --git a/flow/Platform.h b/flow/Platform.h index 74c9395c53..edf9ff3997 100644 --- a/flow/Platform.h +++ b/flow/Platform.h @@ -741,6 +741,8 @@ 
void registerCrashHandler(); void setupRunLoopProfiler(); EXTERNC void setProfilingEnabled(int enabled); +void setupSamplingProfiler(); + // Use _exit() or criticalError(), not exit() #define exit static_assert(false, "Calls to exit() are forbidden by policy"); From f7d3b31ef8f93a9ec845bef3a8216e70c384d804 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Wed, 24 Mar 2021 16:27:35 -0600 Subject: [PATCH 025/180] Actually close files in simulation --- fdbrpc/AsyncFileNonDurable.actor.h | 4 ++++ fdbrpc/sim2.actor.cpp | 16 ++++++++-------- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/fdbrpc/AsyncFileNonDurable.actor.h b/fdbrpc/AsyncFileNonDurable.actor.h index 49fe0e2c8f..13fdcc25a5 100644 --- a/fdbrpc/AsyncFileNonDurable.actor.h +++ b/fdbrpc/AsyncFileNonDurable.actor.h @@ -267,6 +267,10 @@ public: Future deleteFuture = deleteFile(this); if (!deleteFuture.isReady()) filesBeingDeleted[filename] = deleteFuture; + } else if (isSoleOwner()) { + // isSoleOwner is a bit confusing here. What we mean is that the openFiles map is the sole owner. If we + // we remove the file from the map to make sure it gets closed. + g_simulator.getCurrentProcess()->machine->openFiles.erase(filename); } } diff --git a/fdbrpc/sim2.actor.cpp b/fdbrpc/sim2.actor.cpp index 1af14ec676..6cddbb7e88 100644 --- a/fdbrpc/sim2.actor.cpp +++ b/fdbrpc/sim2.actor.cpp @@ -536,7 +536,10 @@ public: std::string getFilename() const override { return actualFilename; } - ~SimpleFile() override { _close(h); } + ~SimpleFile() override { + _close(h); + --openCount; + } private: int h; @@ -1933,10 +1936,7 @@ public: TraceEvent("ClogInterface") .detail("IP", ip.toString()) .detail("Delay", seconds) - .detail("Queue", - mode == ClogSend ? "Send" - : mode == ClogReceive ? "Receive" - : "All"); + .detail("Queue", mode == ClogSend ? "Send" : mode == ClogReceive ? 
"Receive" : "All"); if (mode == ClogSend || mode == ClogAll) g_clogging.clogSendFor(ip, seconds); @@ -2408,9 +2408,9 @@ int sf_open(const char* filename, int flags, int convFlags, int mode) { GENERIC_READ | ((flags & IAsyncFile::OPEN_READWRITE) ? GENERIC_WRITE : 0), FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, nullptr, - (flags & IAsyncFile::OPEN_EXCLUSIVE) ? CREATE_NEW - : (flags & IAsyncFile::OPEN_CREATE) ? OPEN_ALWAYS - : OPEN_EXISTING, + (flags & IAsyncFile::OPEN_EXCLUSIVE) + ? CREATE_NEW + : (flags & IAsyncFile::OPEN_CREATE) ? OPEN_ALWAYS : OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, nullptr); int h = -1; From 6a344ddeab4eac19ee34f1af7649a6b5e8e39efc Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Wed, 24 Mar 2021 16:56:11 -0600 Subject: [PATCH 026/180] fix typo --- fdbrpc/AsyncFileNonDurable.actor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbrpc/AsyncFileNonDurable.actor.h b/fdbrpc/AsyncFileNonDurable.actor.h index 13fdcc25a5..8cc65bf4a5 100644 --- a/fdbrpc/AsyncFileNonDurable.actor.h +++ b/fdbrpc/AsyncFileNonDurable.actor.h @@ -269,7 +269,7 @@ public: filesBeingDeleted[filename] = deleteFuture; } else if (isSoleOwner()) { // isSoleOwner is a bit confusing here. What we mean is that the openFiles map is the sole owner. If we - // we remove the file from the map to make sure it gets closed. + // remove the file from the map to make sure it gets closed. 
g_simulator.getCurrentProcess()->machine->openFiles.erase(filename); } } From b51e4aa59048ed73afbb6a6d82b4d86f520f6129 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Wed, 24 Mar 2021 19:57:24 -0600 Subject: [PATCH 027/180] handle file renames properly --- fdbrpc/AsyncFileNonDurable.actor.h | 12 +++++++++++- flow/flow.h | 2 ++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/fdbrpc/AsyncFileNonDurable.actor.h b/fdbrpc/AsyncFileNonDurable.actor.h index 8cc65bf4a5..21cfda8907 100644 --- a/fdbrpc/AsyncFileNonDurable.actor.h +++ b/fdbrpc/AsyncFileNonDurable.actor.h @@ -270,7 +270,17 @@ public: } else if (isSoleOwner()) { // isSoleOwner is a bit confusing here. What we mean is that the openFiles map is the sole owner. If we // remove the file from the map to make sure it gets closed. - g_simulator.getCurrentProcess()->machine->openFiles.erase(filename); + auto& openFiles = g_simulator.getCurrentProcess()->machine->openFiles; + auto iter = openFiles.find(filename); + // the file could've been renamed (DiskQueue does that for example). In that case the file won't be in the + // map anymore. + if (iter != openFiles.end()) { + // even if the filename exists, it doesn't mean that it references the same file. It could be that the + // file was renamed and later a file with the same name was opened. 
+ if (iter->second.canGet() && iter->second.get().getPtr() == this) { + openFiles.erase(filename); + } + } } } diff --git a/flow/flow.h b/flow/flow.h index 987572d7c5..e03d598d9b 100644 --- a/flow/flow.h +++ b/flow/flow.h @@ -674,6 +674,8 @@ public: bool isValid() const { return sav != 0; } bool isReady() const { return sav->isSet(); } bool isError() const { return sav->isError(); } + // returns true if get can be called on this future (counterpart of canBeSet on Promises) + bool canGet() const { return isValid() && isReady() && !isError(); } Error& getError() const { ASSERT(isError()); return sav->error_state; From 1385a776daa0b90cb20478251d0faf8766cb1a10 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Thu, 25 Mar 2021 13:22:29 -0600 Subject: [PATCH 028/180] only remove files from the open map if they have no modifications in flight --- fdbrpc/AsyncFileNonDurable.actor.h | 49 ++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/fdbrpc/AsyncFileNonDurable.actor.h b/fdbrpc/AsyncFileNonDurable.actor.h index 21cfda8907..281b3f289d 100644 --- a/fdbrpc/AsyncFileNonDurable.actor.h +++ b/fdbrpc/AsyncFileNonDurable.actor.h @@ -259,6 +259,37 @@ public: //TraceEvent("AsyncFileNonDurable_Destroy", id).detail("Filename", filename); } + // The purpose of this actor is to simply keep a reference to a non-durable file until all pending modifications + // have completed. When they return, this actor will die and therefore decrement the reference count by 1. 
+ ACTOR void waitOnOutstandingModifications(Reference self) { + state ISimulator::ProcessInfo* currentProcess = g_simulator.getCurrentProcess(); + state TaskPriority currentTaskID = g_network->getCurrentTask(); + state std::string filename = self->filename; + + wait(g_simulator.onMachine(currentProcess)); + try { + Promise startSyncPromise = self->startSyncPromise; + self->startSyncPromise = Promise(); + startSyncPromise.send(true); + + std::vector> outstandingModifications; + + for (auto itr = self->pendingModifications.ranges().begin(); + itr != self->pendingModifications.ranges().end(); + ++itr) + if (itr->value().isValid() && !itr->value().isReady()) + outstandingModifications.push_back(itr->value()); + + // Ignore errors here so that all modifications can finish + wait(waitForAllReady(outstandingModifications)); + wait(g_simulator.onProcess(currentProcess, currentTaskID)); + } catch (Error& e) { + state Error err = e; + wait(g_simulator.onProcess(currentProcess, currentTaskID)); + throw err; + } + } + void addref() override { ReferenceCounted::addref(); } void delref() override { if (delref_no_destroy()) { @@ -270,6 +301,24 @@ public: } else if (isSoleOwner()) { // isSoleOwner is a bit confusing here. What we mean is that the openFiles map is the sole owner. If we // remove the file from the map to make sure it gets closed. + bool hasPendingModifications = false; + for (auto iter = pendingModifications.ranges().begin(); iter != pendingModifications.ranges().end(); + ++iter) { + if (iter->value().isValid() && !iter->value().isReady()) { + hasPendingModifications = true; + break; + } + } + if (hasPendingModifications) { + // If we still have pending references we won't close the file and instead wait for them. But while we + // wait for those to complete, another actor might open the file. So we call into an actor that will + // hold a refernce until all pending operations are complete. If someone opens this file before this + // completes, nothing will happen. 
Otherwise we will enter delref again but this time + // hasPendingModifications will evalualte to false. + addref(); + waitOnOutstandingModifications(Reference(this)); + return; + } auto& openFiles = g_simulator.getCurrentProcess()->machine->openFiles; auto iter = openFiles.find(filename); // the file could've been renamed (DiskQueue does that for example). In that case the file won't be in the From 1033db9fba275a809b3159fc2d52a92293350a45 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Thu, 25 Mar 2021 14:00:07 -0600 Subject: [PATCH 029/180] Revert change --- fdbrpc/AsyncFileNonDurable.actor.h | 47 +++++++----------------------- 1 file changed, 11 insertions(+), 36 deletions(-) diff --git a/fdbrpc/AsyncFileNonDurable.actor.h b/fdbrpc/AsyncFileNonDurable.actor.h index 281b3f289d..f65895067e 100644 --- a/fdbrpc/AsyncFileNonDurable.actor.h +++ b/fdbrpc/AsyncFileNonDurable.actor.h @@ -267,27 +267,20 @@ public: state std::string filename = self->filename; wait(g_simulator.onMachine(currentProcess)); - try { - Promise startSyncPromise = self->startSyncPromise; - self->startSyncPromise = Promise(); - startSyncPromise.send(true); + Promise startSyncPromise = self->startSyncPromise; + self->startSyncPromise = Promise(); + startSyncPromise.send(true); - std::vector> outstandingModifications; + std::vector> outstandingModifications; - for (auto itr = self->pendingModifications.ranges().begin(); - itr != self->pendingModifications.ranges().end(); - ++itr) - if (itr->value().isValid() && !itr->value().isReady()) - outstandingModifications.push_back(itr->value()); + for (auto itr = self->pendingModifications.ranges().begin(); itr != self->pendingModifications.ranges().end(); + ++itr) + if (itr->value().isValid() && !itr->value().isReady()) + outstandingModifications.push_back(itr->value()); - // Ignore errors here so that all modifications can finish - wait(waitForAllReady(outstandingModifications)); - wait(g_simulator.onProcess(currentProcess, currentTaskID)); - } catch 
(Error& e) { - state Error err = e; - wait(g_simulator.onProcess(currentProcess, currentTaskID)); - throw err; - } + // Ignore errors here so that all modifications can finish + wait(waitForAllReady(outstandingModifications)); + wait(g_simulator.onProcess(currentProcess, currentTaskID)); } void addref() override { ReferenceCounted::addref(); } @@ -301,24 +294,6 @@ public: } else if (isSoleOwner()) { // isSoleOwner is a bit confusing here. What we mean is that the openFiles map is the sole owner. If we // remove the file from the map to make sure it gets closed. - bool hasPendingModifications = false; - for (auto iter = pendingModifications.ranges().begin(); iter != pendingModifications.ranges().end(); - ++iter) { - if (iter->value().isValid() && !iter->value().isReady()) { - hasPendingModifications = true; - break; - } - } - if (hasPendingModifications) { - // If we still have pending references we won't close the file and instead wait for them. But while we - // wait for those to complete, another actor might open the file. So we call into an actor that will - // hold a refernce until all pending operations are complete. If someone opens this file before this - // completes, nothing will happen. Otherwise we will enter delref again but this time - // hasPendingModifications will evalualte to false. - addref(); - waitOnOutstandingModifications(Reference(this)); - return; - } auto& openFiles = g_simulator.getCurrentProcess()->machine->openFiles; auto iter = openFiles.find(filename); // the file could've been renamed (DiskQueue does that for example). 
In that case the file won't be in the From f2d368711058226f76b89ca57909a25a61127e85 Mon Sep 17 00:00:00 2001 From: Lukas Joswiak Date: Mon, 29 Mar 2021 16:06:26 -0700 Subject: [PATCH 030/180] Print stack --- cmake/CompileBoost.cmake | 2 +- flow/Platform.actor.cpp | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/cmake/CompileBoost.cmake b/cmake/CompileBoost.cmake index 9e7fbd2971..0b1cc68502 100644 --- a/cmake/CompileBoost.cmake +++ b/cmake/CompileBoost.cmake @@ -10,7 +10,7 @@ function(compile_boost) set(BOOST_COMPILER_FLAGS -fvisibility=hidden -fPIC -std=c++14 -w) set(BOOST_CXX_COMPILER "${CMAKE_CXX_COMPILER}") if(APPLE) - set(BOOST_TOOLSET "darwin") + set(BOOST_TOOLSET "clang-darwin") # this is to fix a weird macOS issue -- by default # cmake would otherwise pass a compiler that can't # compile boost diff --git a/flow/Platform.actor.cpp b/flow/Platform.actor.cpp index 756fb6a7e3..d81dee877a 100644 --- a/flow/Platform.actor.cpp +++ b/flow/Platform.actor.cpp @@ -3680,12 +3680,19 @@ void* sampleThread(void* arg) { threadSleep(1.0); // TODO: Read sample rate from global config // TODO: Copy actor lineage of currently running actor + // Read currentLineage auto diskAlps = IAsyncFileSystem::filesystem()->getActorLineageSet().copy(); printf("Disk ALPs: %d\n", diskAlps.size()); // TODO: Call collect on all actor lineages for (auto actorLineage : diskAlps) { + auto stack = actorLineage->stack(&StackLineage::actorName); + while (!stack.empty()) { + printf("%s ", stack.top()); + stack.pop(); + } + printf("\n"); } // TODO: Serialize collected actor linage properties From c90be2003f8dffe6161d96ae90d53023cd6a4a3b Mon Sep 17 00:00:00 2001 From: Lukas Joswiak Date: Thu, 1 Apr 2021 10:34:59 -0700 Subject: [PATCH 031/180] Profile running actor --- flow/Platform.actor.cpp | 8 +++++++ flow/WriteOnlySet.actor.cpp | 41 +++++++++++++++++++++++++++++++++ flow/WriteOnlySet.h | 46 ++++++++++++++++++++++++++++++++++++- flow/flow.cpp | 1 + flow/flow.h | 17 
++++++++++++-- 5 files changed, 110 insertions(+), 3 deletions(-) diff --git a/flow/Platform.actor.cpp b/flow/Platform.actor.cpp index d81dee877a..50f252021b 100644 --- a/flow/Platform.actor.cpp +++ b/flow/Platform.actor.cpp @@ -3681,6 +3681,14 @@ void* sampleThread(void* arg) { // TODO: Copy actor lineage of currently running actor // Read currentLineage + auto actorLineage = currentLineageThreadSafe.get(); + printf("Currently running actor lineage (%p):\n", actorLineage.getPtr()); + auto stack = actorLineage->stack(&StackLineage::actorName); + while (!stack.empty()) { + printf("%s ", stack.top()); + stack.pop(); + } + printf("\n"); auto diskAlps = IAsyncFileSystem::filesystem()->getActorLineageSet().copy(); printf("Disk ALPs: %d\n", diskAlps.size()); diff --git a/flow/WriteOnlySet.actor.cpp b/flow/WriteOnlySet.actor.cpp index 92eceea7bc..c79f8f4db7 100644 --- a/flow/WriteOnlySet.actor.cpp +++ b/flow/WriteOnlySet.actor.cpp @@ -67,6 +67,32 @@ bool WriteOnlySet::erase(Index idx) { return res; } +template +bool WriteOnlySet::replace(Index idx, const Reference& lineage) { + auto lineagePtr = reinterpret_cast(lineage.getPtr()); + ASSERT((lineagePtr % 2) == 0); // this needs to be at least 2-byte aligned + + while (true) { + if (lineage.isValid()) { + lineage->addref(); + } + + auto ptr = _set[idx].load(); + if (ptr & LOCK) { + _set[idx].store(lineagePtr); + return false; + } else { + if (_set[idx].compare_exchange_strong(ptr, lineagePtr)) { + if (ptr) { + reinterpret_cast(ptr)->delref(); + } + _set[idx].store(lineagePtr); + return ptr != 0; + } + } + } +} + template WriteOnlySet::WriteOnlySet() : _set(CAPACITY) { // insert the free indexes in reverse order @@ -103,8 +129,23 @@ std::vector> WriteOnlySet::copy() { return result; } +template +WriteOnlyVariable::WriteOnlyVariable() : WriteOnlySet() {} + +template +Reference WriteOnlyVariable::get() { + auto result = WriteOnlySet::copy(); + return result.size() ? 
result.at(0) : Reference(); +} + +template +bool WriteOnlyVariable::replace(const Reference& element) { + return WriteOnlySet::replace(0, element); +} + // Explicit instantiation template class WriteOnlySet; +template class WriteOnlyVariable; // testing code namespace { diff --git a/flow/WriteOnlySet.h b/flow/WriteOnlySet.h index c71736f852..73da2bfac1 100644 --- a/flow/WriteOnlySet.h +++ b/flow/WriteOnlySet.h @@ -72,6 +72,17 @@ public: * later. Note that at the time the return value is checked, \ref delref might already have been called. */ bool erase(Index idx); + + /** + * Replaces the object associated with \p idx with \p lineage. + * + * \ret Whether the reference count of the replaced object was decremented. Usually the return value is only + * interesting for testing and benchmarking purposes and will in most cases be ignored. If \ref delref + * wasn't called, it will be called later. Note that at the time the return value is checked, \ref delref + * might already have been called. + */ + bool replace(Index idx, const Reference& lineage); + /** * Copies all elements that are stored in the set into a vector. This copy operation does NOT provide a snapshot of * the data structure. The contract is weak: @@ -82,7 +93,7 @@ public: */ std::vector> copy(); -private: +protected: // the implementation of erase -- the wrapper just makes the function a bit more readable. bool eraseImpl(Index idx); @@ -112,6 +123,39 @@ private: boost::lockfree::queue, boost::lockfree::capacity> freeList; }; +/** + * Provides a thread safe, lock-free write only variable. + * + * Template parameters: + * \param T The type to store. 
+ * \param IndexType The type used as an index + * \pre T implements `void addref() const` and `void delref() const` + * \pre IndexType must have a copy constructor + * \pre IndexType must have a trivial assignment operator + * \pre IndexType must have a trivial destructor + * \pre IndexType can be used as an index into a std::vector + */ +template +class WriteOnlyVariable : private WriteOnlySet { +public: + explicit WriteOnlyVariable(); + + /** + * Returns a copied reference to the stored variable. + */ + Reference get(); + + /** + * Replaces the variable with \p lineage. \p lineage is permitted to be an invalid pointer. + * + * \ret Whether the reference count of the replaced object was decremented. Note that if the reference being replaced + * is invalid, this function will always return false. If \ref delref wasn't called and the reference was valid, + * it will be called later. Note that at the time the return value is checked, \ref delref might already have + * been called. + */ + bool replace(const Reference& element); +}; + class ActorLineage; extern template class WriteOnlySet; diff --git a/flow/flow.cpp b/flow/flow.cpp index 02e5b93410..82bf2be43b 100644 --- a/flow/flow.cpp +++ b/flow/flow.cpp @@ -27,6 +27,7 @@ #include thread_local Reference currentLineage; +WriteOnlyVariable currentLineageThreadSafe; LineagePropertiesBase::~LineagePropertiesBase() {} diff --git a/flow/flow.h b/flow/flow.h index 430a12a460..b61453c8f2 100644 --- a/flow/flow.h +++ b/flow/flow.h @@ -50,6 +50,7 @@ #include "flow/ThreadPrimitives.h" #include "flow/network.h" #include "flow/FileIdentifier.h" +#include "flow/WriteOnlySet.h" #include @@ -500,6 +501,7 @@ public: }; extern thread_local Reference currentLineage; +extern WriteOnlyVariable currentLineageThreadSafe; // This class can be used in order to modify all lineage properties // of actors created within a (non-actor) scope @@ -509,14 +511,21 @@ struct LocalLineage { LocalLineage() { oldLineage = currentLineage; currentLineage 
= lineage; + currentLineageThreadSafe.replace(lineage); + } + ~LocalLineage() { + currentLineage = oldLineage; + currentLineageThreadSafe.replace(oldLineage); } - ~LocalLineage() { currentLineage = oldLineage; } }; struct restore_lineage { Reference prev; restore_lineage() : prev(currentLineage) {} - ~restore_lineage() { currentLineage = prev; } + ~restore_lineage() { + currentLineage = prev; + currentLineageThreadSafe.replace(prev); + } }; struct StackLineage : LineageProperties { @@ -1108,12 +1117,14 @@ struct Actor : SAV { Actor() : SAV(1, 1), actor_wait_state(0) { /*++actorCount;*/ currentLineage = lineage; + currentLineageThreadSafe.replace(lineage); } //~Actor() { --actorCount; } Reference setLineage() { auto res = currentLineage; currentLineage = lineage; + currentLineageThreadSafe.replace(lineage); return res; } }; @@ -1128,12 +1139,14 @@ struct Actor { Actor() : actor_wait_state(0) { /*++actorCount;*/ currentLineage = lineage; + currentLineageThreadSafe.replace(lineage); } //~Actor() { --actorCount; } Reference setLineage() { auto res = currentLineage; currentLineage = lineage; + currentLineageThreadSafe.replace(lineage); return res; } }; From 41d1aee609374905ad217b11524bb3c19adef0cb Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Thu, 1 Apr 2021 14:06:13 -0600 Subject: [PATCH 032/180] delete dead code --- fdbrpc/AsyncFileNonDurable.actor.h | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/fdbrpc/AsyncFileNonDurable.actor.h b/fdbrpc/AsyncFileNonDurable.actor.h index 6168c01abc..2234ee0b26 100644 --- a/fdbrpc/AsyncFileNonDurable.actor.h +++ b/fdbrpc/AsyncFileNonDurable.actor.h @@ -268,30 +268,6 @@ public: //TraceEvent("AsyncFileNonDurable_Destroy", id).detail("Filename", filename); } - // The purpose of this actor is to simply keep a reference to a non-durable file until all pending modifications - // have completed. When they return, this actor will die and therefore decrement the reference count by 1. 
- ACTOR void waitOnOutstandingModifications(Reference self) { - state ISimulator::ProcessInfo* currentProcess = g_simulator.getCurrentProcess(); - state TaskPriority currentTaskID = g_network->getCurrentTask(); - state std::string filename = self->filename; - - wait(g_simulator.onMachine(currentProcess)); - Promise startSyncPromise = self->startSyncPromise; - self->startSyncPromise = Promise(); - startSyncPromise.send(true); - - std::vector> outstandingModifications; - - for (auto itr = self->pendingModifications.ranges().begin(); itr != self->pendingModifications.ranges().end(); - ++itr) - if (itr->value().isValid() && !itr->value().isReady()) - outstandingModifications.push_back(itr->value()); - - // Ignore errors here so that all modifications can finish - wait(waitForAllReady(outstandingModifications)); - wait(g_simulator.onProcess(currentProcess, currentTaskID)); - } - void addref() override { ReferenceCounted::addref(); } void delref() override { if (delref_no_destroy()) { From 90ebf90c8ba619afe85c1933851c7fb53fd56943 Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Sat, 3 Apr 2021 19:54:49 -0700 Subject: [PATCH 033/180] Refactored page rebuild logic to bulk build pages full and split pages more evenly. 
--- fdbserver/DeltaTree.h | 2 - fdbserver/IPager.h | 2 +- fdbserver/Knobs.cpp | 2 +- fdbserver/Knobs.h | 2 +- fdbserver/VersionedBTree.actor.cpp | 439 +++++++++++++++++------------ 5 files changed, 267 insertions(+), 180 deletions(-) diff --git a/fdbserver/DeltaTree.h b/fdbserver/DeltaTree.h index bef753a440..2e0fee0b40 100644 --- a/fdbserver/DeltaTree.h +++ b/fdbserver/DeltaTree.h @@ -230,8 +230,6 @@ struct DeltaTree { inline Node& newNode() { return *(Node*)((uint8_t*)this + size()); } public: - // Get count of total overhead bytes (everything but the user-formatted Delta) for a tree given size n - static int emptyTreeSize() { return sizeof(DeltaTree); } struct DecodedNode { DecodedNode() {} diff --git a/fdbserver/IPager.h b/fdbserver/IPager.h index 0f74c744a8..45c9f02fcc 100644 --- a/fdbserver/IPager.h +++ b/fdbserver/IPager.h @@ -76,7 +76,7 @@ public: virtual void delref() = 0; }; -// This API is probably customized to the behavior of DWALPager and probably needs some changes to be more generic. +// This API is probably too customized to the behavior of DWALPager and probably needs some changes to be more generic. class IPager2 : public IClosable { public: // Returns an IPage that can be passed to writePage. The data in the returned IPage might not be zeroed. 
diff --git a/fdbserver/Knobs.cpp b/fdbserver/Knobs.cpp index 29fbc6fcc6..539637580a 100644 --- a/fdbserver/Knobs.cpp +++ b/fdbserver/Knobs.cpp @@ -703,7 +703,7 @@ void ServerKnobs::initialize(bool randomize, ClientKnobs* clientKnobs, bool isSi init( REDWOOD_DEFAULT_PAGE_SIZE, 4096 ); init( REDWOOD_KVSTORE_CONCURRENT_READS, 64 ); init( REDWOOD_COMMIT_CONCURRENT_READS, 64 ); - init( REDWOOD_PAGE_REBUILD_FILL_FACTOR, 0.66 ); + init( REDWOOD_PAGE_REBUILD_MAX_SLACK, 0.33 ); init( REDWOOD_LAZY_CLEAR_BATCH_SIZE_PAGES, 10 ); init( REDWOOD_LAZY_CLEAR_MIN_PAGES, 0 ); init( REDWOOD_LAZY_CLEAR_MAX_PAGES, 1e6 ); diff --git a/fdbserver/Knobs.h b/fdbserver/Knobs.h index 16abf63692..1b8e6874cd 100644 --- a/fdbserver/Knobs.h +++ b/fdbserver/Knobs.h @@ -636,7 +636,7 @@ public: int REDWOOD_DEFAULT_PAGE_SIZE; // Page size for new Redwood files int REDWOOD_KVSTORE_CONCURRENT_READS; // Max number of simultaneous point or range reads in progress. int REDWOOD_COMMIT_CONCURRENT_READS; // Max number of concurrent reads done to support commit operations - double REDWOOD_PAGE_REBUILD_FILL_FACTOR; // When rebuilding pages, start a new page after this capacity + double REDWOOD_PAGE_REBUILD_MAX_SLACK; // When rebuilding pages, max slack to allow in page int REDWOOD_LAZY_CLEAR_BATCH_SIZE_PAGES; // Number of pages to try to pop from the lazy delete queue and process at // once int REDWOOD_LAZY_CLEAR_MIN_PAGES; // Minimum number of pages to free before ending a lazy clear cycle, unless the diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 8051d956b0..4f3bb874c4 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -2864,7 +2864,8 @@ struct RedwoodRecordRef { bool operator>=(const RedwoodRecordRef& rhs) const { return compare(rhs) >= 0; } - // Worst case overhead means to assu + // Worst case overhead means to assume that either the prefix length or the suffix length + // could contain the full key size int 
deltaSize(const RedwoodRecordRef& base, int skipLen, bool worstCaseOverhead) const { int prefixLen = getCommonPrefixLen(base, skipLen); int keySuffixLen = key.size() - prefixLen; @@ -3732,6 +3733,184 @@ private: Future m_lazyClearActor; bool m_lazyClearStop; + struct PageToBuild { + PageToBuild(int index, int blockSize) + : startIndex(index), count(0), pageSize(blockSize), + bytesLeft(blockSize - sizeof(BTreePage) - sizeof(BTreePage::BinaryTree)), + largeDeltaTree(pageSize > BTreePage::BinaryTree::SmallSizeLimit), blockSize(blockSize), blockCount(1), + kvBytes(0) {} + + int startIndex; + int count; + int pageSize; + int bytesLeft; + bool largeDeltaTree; + int blockSize; + int blockCount; + int kvBytes; + + int size() const { return pageSize - bytesLeft; } + + double usedFraction() const { return (double)size() / pageSize; } + + double slackFraction() const { return (double)bytesLeft / pageSize; } + + double kvFraction() const { return (double)kvBytes / pageSize; } + + int endIndex() const { return startIndex + count; } + + int lastIndex() const { return endIndex() - 1; } + + std::string toString() const { + return format( + "{start=%d count=%d used %d/%d bytes (%.2f%% slack) kvBytes=%d blocks=%d blockSize=%d large=%d}", + startIndex, + count, + size(), + pageSize, + slackFraction() * 100, + kvBytes, + blockCount, + blockSize, + largeDeltaTree); + } + + // Move an item from a to b if a has 2 or more items and the item fits in b + // a and b must be consecutive pages from the same array of records + static bool shiftItem(PageToBuild& a, PageToBuild& b, int deltaSize, int kvBytes) { + if (a.count < 2) { + return false; + } + + // Size of the nodes in A and B, respectively + int aNodeSize = deltaSize + BTreePage::BinaryTree::Node::headerSize(a.largeDeltaTree); + int bNodeSize = deltaSize + BTreePage::BinaryTree::Node::headerSize(b.largeDeltaTree); + + if (b.bytesLeft < bNodeSize) { + return false; + } + + --a.count; + ++b.count; + --b.startIndex; + a.bytesLeft += 
aNodeSize; + b.bytesLeft -= bNodeSize; + a.kvBytes -= kvBytes; + b.kvBytes += kvBytes; + + return true; + } + + // Try to add a record of the given delta size to the page. + // If force is true, the page will be expanded to make the record fit if needed. + // Return value is whether or not the record was added to the page. + bool addRecord(const RedwoodRecordRef& rec, int deltaSize, bool force) { + int nodeSize = deltaSize + BTreePage::BinaryTree::Node::headerSize(largeDeltaTree); + + // If the record doesn't fit and the page can't be expanded then return false + if (nodeSize > bytesLeft && !force) { + return false; + } + + ++count; + bytesLeft -= nodeSize; + kvBytes += rec.kvBytes(); + + // If needed, expand page so that record fits. + // This is a loop because the first expansion may increase per-node overhead which could + // then require a second expansion. + while (bytesLeft < 0) { + int newBlocks = (-bytesLeft + blockSize - 1) / blockSize; + int extraSpace = newBlocks * blockSize; + blockCount += newBlocks; + bytesLeft += extraSpace; + pageSize += extraSpace; + + // If size has moved into the "large" range then every node has gotten bigger so adjust bytesLeft + if (!largeDeltaTree && pageSize > BTreePage::BinaryTree::SmallSizeLimit) { + largeDeltaTree = true; + bytesLeft -= (count * BTreePage::BinaryTree::LargeTreePerNodeExtraOverhead); + } + } + return true; + } + }; + + static std::vector splitPages(const RedwoodRecordRef* lowerBound, + const RedwoodRecordRef* upperBound, + int prefixLen, + VectorRef records, + int height, + int blockSize) { + debug_printf("splitPages height=%d records=%d lowerBound=%s upperBound=%s\n", + height, + records.size(), + lowerBound->toString(false).c_str(), + upperBound->toString(false).c_str()); + ASSERT(!records.empty()); + + // Leaves can have just one record if it's large, but internal pages should have at least 4 + int minRecords = height == 1 ? 
1 : 4; + double maxSlack = SERVER_KNOBS->REDWOOD_PAGE_REBUILD_MAX_SLACK; + std::vector pages; + + // deltaSizes contains pair-wise delta sizes for [lowerBound, records..., upperBound] + std::vector deltaSizes(records.size() + 1); + deltaSizes.front() = records.front().deltaSize(*lowerBound, prefixLen, true); + deltaSizes.back() = records.back().deltaSize(*upperBound, prefixLen, true); + for (int i = 1; i < records.size(); ++i) { + deltaSizes[i] = records[i].deltaSize(records[i - 1], prefixLen, true); + } + + PageToBuild p(0, blockSize); + + for (int i = 0; i < records.size(); ++i) { + bool force = p.count < minRecords || p.slackFraction() > maxSlack; + debug_printf( + " before addRecord i=%d records=%d deltaSize=%d kvSize=%d force=%d pageToBuild=%s record=%s", + i, + records.size(), + deltaSizes[i], + records[i].kvBytes(), + force, + p.toString().c_str(), + records[i].toString(height == 1).c_str()); + + if (!p.addRecord(records[i], deltaSizes[i], force)) { + pages.push_back(p); + p = PageToBuild(p.endIndex(), blockSize); + p.addRecord(records[i], deltaSizes[i], true); + } + } + + if (p.count > 0) { + pages.push_back(p); + } + + debug_printf(" Before shift: %s\n", ::toString(pages).c_str()); + + // If page count is > 1, try to balance slack between last two pages + // The buggify disables this balancing as this will result in more edge + // cases of pages with very few records. + if (pages.size() > 1 && !BUGGIFY) { + PageToBuild& a = pages[pages.size() - 2]; + PageToBuild& b = pages.back(); + + // While the last page page has too much slack and the second to last page + // has more than the minimum record count, shift a record from the second + // to last page to the last page. 
+ while (b.slackFraction() > maxSlack && a.count > minRecords) { + int i = a.lastIndex(); + if (!PageToBuild::shiftItem(a, b, deltaSizes[i], records[i].kvBytes())) { + break; + } + debug_printf(" After shifting i=%d: a=%s b=%s\n", i, a.toString().c_str(), b.toString().c_str()); + } + } + + return pages; + } + // Writes entries to 1 or more pages and return a vector of boundary keys with their IPage(s) ACTOR static Future>> writePages(VersionedBTree* self, const RedwoodRecordRef* lowerBound, @@ -3741,197 +3920,130 @@ private: Version v, BTreePageIDRef previousID) { ASSERT(entries.size() > 0); + state Standalone> records; - // This is how much space for the binary tree exists in the page, after the header - state int blockSize = self->m_blockSize; - state int pageSize = blockSize - sizeof(BTreePage); - state int pageFillTarget = pageSize * SERVER_KNOBS->REDWOOD_PAGE_REBUILD_FILL_FACTOR; - state int blockCount = 1; + // All records share the prefix shared by the lower and upper boundaries + state int prefixLen = lowerBound->getCommonPrefixLen(*upperBound); - state int kvBytes = 0; - state int compressedBytes = BTreePage::BinaryTree::emptyTreeSize(); - state bool largeTree = false; - - state int start = 0; - state int i = 0; - // The common prefix length between the first and last records are common to all records - state int skipLen = entries.front().getCommonPrefixLen(entries.back()); - - // Leaves can have just one record if it's large, but internal pages should have at least 4 - state int minimumEntries = (height == 1 ? 
1 : 4); + state std::vector pagesToBuild = + splitPages(lowerBound, upperBound, prefixLen, entries, height, self->m_blockSize); + debug_printf("splitPages returning %s\n", toString(pagesToBuild).c_str()); // Lower bound of the page being added to state RedwoodRecordRef pageLowerBound = lowerBound->withoutValue(); state RedwoodRecordRef pageUpperBound; - while (1) { - // While there are still entries to add and the page isn't full enough, add an entry - while (i < entries.size() && (i - start < minimumEntries || compressedBytes < pageFillTarget)) { - const RedwoodRecordRef& entry = entries[i]; + state int pageIndex; - // Get delta from previous record or page lower boundary if this is the first item in a page - const RedwoodRecordRef& base = (i == start) ? pageLowerBound : entries[i - 1]; + for (pageIndex = 0; pageIndex < pagesToBuild.size(); ++pageIndex) { + auto& p = pagesToBuild[pageIndex]; + debug_printf("building page %d of %d %s\n", pageIndex + 1, pagesToBuild.size(), p.toString().c_str()); + ASSERT(p.count != 0); - // All record pairs in entries have skipLen bytes in common with each other, but for i == 0 the base is - // lowerBound - int skip = i == 0 ? 0 : skipLen; + // For internal pages, skip first entry if child link is null. Such links only exist + // to maintain a borrow-able prefix for the previous subtree after a subtree deletion. + // If the null link falls on a new page post-split, then the pageLowerBound of the page + // being built now will serve as the previous subtree's upper boundary as it is the same + // key as entries[p.startIndex] and there is no need to actually store the null link in + // the new page. 
+ if (height != 1 && !entries[p.startIndex].value.present()) { + p.kvBytes -= entries[p.startIndex].key.size(); + ++p.startIndex; + --p.count; + debug_printf("Skipping first null record, new count=%d\n", p.count); - // In a delta tree, all common prefix bytes that can be borrowed, will be, but not necessarily - // by the same records during the linear estimate of the built page size. Since the key suffix bytes - // and therefore the key prefix lengths can be distributed differently in the balanced tree, worst case - // overhead for the delta size must be assumed. - int deltaSize = entry.deltaSize(base, skip, true); - - int nodeSize = BTreePage::BinaryTree::Node::headerSize(largeTree) + deltaSize; - debug_printf("Adding %3d of %3lu (i=%3d) klen %4d vlen %5d nodeSize %5d deltaSize %5d page usage: " - "%d/%d (%.2f%%) record=%s\n", - i + 1, - entries.size(), - i, - entry.key.size(), - entry.value.orDefault(StringRef()).size(), - nodeSize, - deltaSize, - compressedBytes, - pageSize, - (float)compressedBytes / pageSize * 100, - entry.toString(height == 1).c_str()); - - // While the node doesn't fit, expand the page. - // This is a loop because if the page size moves into "large" range for DeltaTree - // then the overhead will increase, which could require another page expansion. 
- int spaceAvailable = pageSize - compressedBytes; - if (nodeSize > spaceAvailable) { - // Figure out how many additional whole or partial blocks are needed - // newBlocks = ceil ( additional space needed / block size) - int newBlocks = 1 + (nodeSize - spaceAvailable - 1) / blockSize; - int newPageSize = pageSize + (newBlocks * blockSize); - - // If we've moved into "large" page range for the delta tree then add additional overhead required - if (!largeTree && newPageSize > BTreePage::BinaryTree::SmallSizeLimit) { - largeTree = true; - // Add increased overhead for the current node to nodeSize - nodeSize += BTreePage::BinaryTree::LargeTreePerNodeExtraOverhead; - // Add increased overhead for all previously added nodes - compressedBytes += (i - start) * BTreePage::BinaryTree::LargeTreePerNodeExtraOverhead; - - // Update calculations above made with previous overhead sizes - spaceAvailable = pageSize - compressedBytes; - newBlocks = 1 + (nodeSize - spaceAvailable - 1) / blockSize; - newPageSize = pageSize + (newBlocks * blockSize); - } - - blockCount += newBlocks; - pageSize = newPageSize; - pageFillTarget = pageSize * SERVER_KNOBS->REDWOOD_PAGE_REBUILD_FILL_FACTOR; + // If the page is now empty then it must be the last page in pagesToBuild, otherwise there would + // be more than 1 item since internal pages need to have multiple children. While there is no page + // to be built here, a record must be added to the output set because the upper boundary of the last + // page built does not match the upper boundary of the original page that this call to writePages() is + // replacing. Put another way, the upper boundary of the rightmost page of the page set that was just + // built does not match the upper boundary of the original page that the page set is replacing, so + // adding the extra null link fixes this. 
+ if (p.count == 0) { + ASSERT(pageIndex == pagesToBuild.size() - 1); + records.push_back_deep(records.arena(), pageUpperBound); + break; } - - kvBytes += entry.kvBytes(); - compressedBytes += nodeSize; - ++i; - } - - // Flush the accumulated records to a page - state int nextStart = i; - // If we are building internal pages and there is a record after this page (index nextStart) but it has an - // empty childPage value then skip it. It only exists to serve as an upper boundary for a child page that - // has not been rewritten in the current commit, and that purpose will now be served by the upper bound of - // the page we are now building. - if (height != 1 && nextStart < entries.size() && !entries[nextStart].value.present()) { - ++nextStart; } // Use the next entry as the upper bound, or upperBound if there are no more entries beyond this page - pageUpperBound = (i == entries.size()) ? upperBound->withoutValue() : entries[i].withoutValue(); + int endIndex = p.endIndex(); + bool lastPage = endIndex == entries.size(); + pageUpperBound = lastPage ? 
upperBound->withoutValue() : entries[endIndex].withoutValue(); // If this is a leaf page, and not the last one to be written, shorten the upper boundary - state bool isLastPage = (nextStart == entries.size()); - if (!isLastPage && height == 1) { - int commonPrefix = pageUpperBound.getCommonPrefixLen(entries[i - 1], 0); + if (!lastPage && height == 1) { + int commonPrefix = pageUpperBound.getCommonPrefixLen(entries[endIndex - 1], prefixLen); pageUpperBound.truncate(commonPrefix + 1); } state std::vector> pages; BTreePage* btPage; - int capacity = blockSize * blockCount; - if (blockCount == 1) { + if (p.blockCount == 1) { Reference page = self->m_pager->newPageBuffer(); btPage = (BTreePage*)page->mutate(); pages.push_back(std::move(page)); } else { - ASSERT(blockCount > 1); - btPage = (BTreePage*)new uint8_t[capacity]; + ASSERT(p.blockCount > 1); + btPage = (BTreePage*)new uint8_t[p.pageSize]; } btPage->height = height; - btPage->kvBytes = kvBytes; + btPage->kvBytes = p.kvBytes; - debug_printf( - "Building tree. start=%d i=%d count=%d page usage: %d/%d (%.2f%%) bytes\nlower: %s\nupper: %s\n", - start, - i, - i - start, - compressedBytes, - pageSize, - (float)compressedBytes / pageSize * 100, - pageLowerBound.toString(false).c_str(), - pageUpperBound.toString(false).c_str()); + debug_printf("Building tree for %s\nlower: %s\nupper: %s\n", + p.toString().c_str(), + pageLowerBound.toString(false).c_str(), + pageUpperBound.toString(false).c_str()); - int written = - btPage->tree().build(pageSize, &entries[start], &entries[i], &pageLowerBound, &pageUpperBound); - if (written > pageSize) { - debug_printf("ERROR: Wrote %d bytes to %d byte page (%d blocks). 
recs %d kvBytes %d compressed %d\n", + int deltaTreeSpace = p.pageSize - sizeof(BTreePage); + state int written = btPage->tree().build( + deltaTreeSpace, &entries[p.startIndex], &entries[endIndex], &pageLowerBound, &pageUpperBound); + + if (written > deltaTreeSpace) { + debug_printf("ERROR: Wrote %d bytes to page %s deltaTreeSpace=%d\n", written, - pageSize, - blockCount, - i - start, - kvBytes, - compressedBytes); - fprintf(stderr, - "ERROR: Wrote %d bytes to %d byte page (%d blocks). recs %d kvBytes %d compressed %d\n", - written, - pageSize, - blockCount, - i - start, - kvBytes, - compressedBytes); + p.toString().c_str(), + deltaTreeSpace); + TraceEvent(SevError, "RedwoodDeltaTreeOverflow") + .detail("PageSize", p.pageSize) + .detail("BytesWritten", written); ASSERT(false); } auto& metrics = g_redwoodMetrics.level(btPage->height); metrics.pageBuild += 1; - metrics.pageBuildExt += blockCount - 1; - metrics.buildFillPct += (double)written / capacity; - metrics.buildStoredPct += (double)btPage->kvBytes / capacity; - metrics.buildItemCount += btPage->tree().numItems; + metrics.pageBuildExt += p.blockCount - 1; + metrics.buildFillPct += p.usedFraction(); + metrics.buildStoredPct += p.kvFraction(); + metrics.buildItemCount += p.count; // Create chunked pages // TODO: Avoid copying page bytes, but this is not trivial due to how pager checksums are currently handled. 
- if (blockCount != 1) { + if (p.blockCount != 1) { // Mark the slack in the page buffer as defined - VALGRIND_MAKE_MEM_DEFINED(((uint8_t*)btPage) + written, (blockCount * blockSize) - written); + VALGRIND_MAKE_MEM_DEFINED(((uint8_t*)btPage) + written, (p.blockCount * p.blockSize) - written); const uint8_t* rptr = (const uint8_t*)btPage; - for (int b = 0; b < blockCount; ++b) { + for (int b = 0; b < p.blockCount; ++b) { Reference page = self->m_pager->newPageBuffer(); - memcpy(page->mutate(), rptr, blockSize); - rptr += blockSize; + memcpy(page->mutate(), rptr, p.blockSize); + rptr += p.blockSize; pages.push_back(std::move(page)); } delete[](uint8_t*) btPage; } // Write this btree page, which is made of 1 or more pager pages. - state int p; state BTreePageIDRef childPageID; + state int k; // If we are only writing 1 page and it has the same BTreePageID size as the original then try to reuse the // LogicalPageIDs in previousID and try to update them atomically. - bool isOnlyPage = isLastPage && (start == 0); - if (isOnlyPage && previousID.size() == pages.size()) { - for (p = 0; p < pages.size(); ++p) { - LogicalPageID id = wait(self->m_pager->atomicUpdatePage(previousID[p], pages[p], v)); + if (pagesToBuild.size() == 1 && previousID.size() == pages.size()) { + for (k = 0; k < pages.size(); ++k) { + LogicalPageID id = wait(self->m_pager->atomicUpdatePage(previousID[k], pages[k], v)); childPageID.push_back(records.arena(), id); } } else { @@ -3942,31 +4054,25 @@ private: if (records.empty()) { self->freeBTreePage(previousID, v); } - for (p = 0; p < pages.size(); ++p) { + for (k = 0; k < pages.size(); ++k) { LogicalPageID id = wait(self->m_pager->newPageID()); - self->m_pager->updatePage(id, pages[p]); + self->m_pager->updatePage(id, pages[k]); childPageID.push_back(records.arena(), id); } } wait(yield()); - debug_printf("Flushing %s lastPage=%d original=%s start=%d i=%d count=%d page usage: %d/%d (%.2f%%) " - "bytes\nlower: %s\nupper: %s\n", - 
toString(childPageID).c_str(), - isLastPage, - toString(previousID).c_str(), - start, - i, - i - start, - compressedBytes, - pageSize, - (float)compressedBytes / pageSize * 100, - pageLowerBound.toString(false).c_str(), - pageUpperBound.toString(false).c_str()); - if (REDWOOD_DEBUG) { - for (int j = start; j < i; ++j) { + auto& p = pagesToBuild[pageIndex]; + debug_printf("Wrote %s original=%s deltaTreeSize=%d for %s\nlower: %s\nupper: %s\n", + toString(childPageID).c_str(), + toString(previousID).c_str(), + written, + p.toString().c_str(), + pageLowerBound.toString(false).c_str(), + pageUpperBound.toString(false).c_str()); + for (int j = p.startIndex; j < p.endIndex(); ++j) { debug_printf(" %3d: %s\n", j, entries[j].toString(height == 1).c_str()); } ASSERT(pageLowerBound.key <= pageUpperBound.key); @@ -3978,27 +4084,9 @@ private: // records.arena() above records.back().setChildPage(childPageID); - if (isLastPage) { - break; - } - - start = nextStart; - kvBytes = 0; - compressedBytes = BTreePage::BinaryTree::emptyTreeSize(); pageLowerBound = pageUpperBound; } - // If we're writing internal pages, if the last entry was the start of a new page and had an empty child link - // then it would not be written to a page. This means that the upper boundary for the the page set being built - // is not the upper bound of the final page in that set, so it must be added to the output set to preserve the - // decodability of the subtree to its left. Fortunately, this is easy to detect because the loop above would - // exit before i has reached the item count. 
- if (height != 1 && i != entries.size()) { - debug_printf("Adding dummy record to avoid writing useless page containing only one null link: %s\n", - pageUpperBound.toString(false).c_str()); - records.push_back_deep(records.arena(), pageUpperBound); - } - return records; } @@ -4294,11 +4382,12 @@ private: std::string toString() const { std::string s; - s += format("SubtreeSlice: addr=%p skipLen=%d subtreeCleared=%d childrenChanged=%d\n", + s += format("SubtreeSlice: addr=%p skipLen=%d subtreeCleared=%d childrenChanged=%d inPlaceUpdate=%d\n", this, skipLen, childrenChanged && newLinks.empty(), - childrenChanged); + childrenChanged, + inPlaceUpdate); s += format("SubtreeLower: %s\n", subtreeLowerBound->toString(false).c_str()); s += format(" DecodeLower: %s\n", decodeLowerBound->toString(false).c_str()); s += format(" DecodeUpper: %s\n", decodeUpperBound->toString(false).c_str()); From 5c93e684f8c130a399fdb6b7d998917b5a085f7f Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Sun, 4 Apr 2021 19:23:08 -0700 Subject: [PATCH 034/180] Added comments. 
--- fdbserver/DeltaTree.h | 1 - fdbserver/VersionedBTree.actor.cpp | 28 ++++++++++++++++++---------- 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/fdbserver/DeltaTree.h b/fdbserver/DeltaTree.h index 2e0fee0b40..ceff1f2ec3 100644 --- a/fdbserver/DeltaTree.h +++ b/fdbserver/DeltaTree.h @@ -230,7 +230,6 @@ struct DeltaTree { inline Node& newNode() { return *(Node*)((uint8_t*)this + size()); } public: - struct DecodedNode { DecodedNode() {} diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 4f3bb874c4..071ca5d074 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -3733,6 +3733,7 @@ private: Future m_lazyClearActor; bool m_lazyClearStop; + // Describes a range of a vector of records that should be built into a BTreePage struct PageToBuild { PageToBuild(int index, int blockSize) : startIndex(index), count(0), pageSize(blockSize), @@ -3740,27 +3741,33 @@ private: largeDeltaTree(pageSize > BTreePage::BinaryTree::SmallSizeLimit), blockSize(blockSize), blockCount(1), kvBytes(0) {} - int startIndex; - int count; - int pageSize; - int bytesLeft; - bool largeDeltaTree; - int blockSize; - int blockCount; - int kvBytes; + int startIndex; // Index of the first record + int count; // Number of records added to the page + int pageSize; // Page size required to hold a BTreePage of the added records, which is a multiple of blockSize + int bytesLeft; // Bytes in pageSize that are unused by the BTreePage so far + bool largeDeltaTree; // Whether or not the DeltaTree in the generated page is in the 'large' size range + int blockSize; // Base block size by which pageSize can be incremented + int blockCount; // The number of blocks in pageSize + int kvBytes; // The amount of user key/value bytes added to the page + // Number of bytes used by the generated/serialized BTreePage int size() const { return pageSize - bytesLeft; } + // Used fraction of pageSize bytes double usedFraction() const { 
return (double)size() / pageSize; } + // Unused fraction of pageSize bytes double slackFraction() const { return (double)bytesLeft / pageSize; } + // Fraction of PageSize in use by key or value string bytes, disregarding all overhead including string sizes double kvFraction() const { return (double)kvBytes / pageSize; } - int endIndex() const { return startIndex + count; } - + // Index of the last record to be included in this page int lastIndex() const { return endIndex() - 1; } + // Index of the first record NOT included in this page + int endIndex() const { return startIndex + count; } + std::string toString() const { return format( "{start=%d count=%d used %d/%d bytes (%.2f%% slack) kvBytes=%d blocks=%d blockSize=%d large=%d}", @@ -3836,6 +3843,7 @@ private: } }; + // Scans a vector of records and decides on page split points, returning a vector of 1+ pages to build static std::vector splitPages(const RedwoodRecordRef* lowerBound, const RedwoodRecordRef* upperBound, int prefixLen, From 5f89640b1bc8853ea1c069b038548848d5cc24b0 Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Tue, 6 Apr 2021 02:45:33 -0700 Subject: [PATCH 035/180] Added performance unit test options for read parallelism, using existing file, and whether or not to insert new records. --- fdbserver/VersionedBTree.actor.cpp | 48 +++++++++++++++++++----------- 1 file changed, 31 insertions(+), 17 deletions(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 614ce3a51b..10866d8e12 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -8133,17 +8133,10 @@ TEST_CASE(":/redwood/performance/set") { g_redwoodMetricsActor = Void(); // Prevent trace event metrics from starting g_redwoodMetrics.clear(); - // If a test file is passed in by environment then don't write new data to it. - state bool reload = getenv("TESTFILE") == nullptr; - state std::string pagerFile = reload ? 
"unittest.redwood" : getenv("TESTFILE"); - - if (reload) { - printf("Deleting old test data\n"); - deleteFile(pagerFile); - } - + state std::string fileName = params.getParam("fileName").orDefault("unittest.redwood"); state int pageSize = params.getIntParam("pageSize").orDefault(SERVER_KNOBS->REDWOOD_DEFAULT_PAGE_SIZE); - state int64_t pageCacheBytes = params.getIntParam("pageCacheBytes").orDefault(FLOW_KNOBS->PAGE_CACHE_4K); + state int64_t pageCacheBytes = + params.getIntParam("pageCacheBytes").orDefault(FLOW_KNOBS->PAGE_CACHE_4K); state int nodeCount = params.getIntParam("nodeCount").orDefault(1e9); state int maxRecordsPerCommit = params.getIntParam("maxRecordsPerCommit").orDefault(20000); state int maxKVBytesPerCommit = params.getIntParam("maxKVBytesPerCommit").orDefault(20e6); @@ -8158,6 +8151,10 @@ TEST_CASE(":/redwood/performance/set") { state char lastKeyChar = params.getParam("lastKeyChar").orDefault("m")[0]; state Version remapCleanupWindow = params.getIntParam("remapCleanupWindow").orDefault(SERVER_KNOBS->REDWOOD_REMAP_CLEANUP_WINDOW); + state bool openExisting = params.getIntParam("openExisting").orDefault(0); + state bool insertRecords = !openExisting || params.getIntParam("insertRecords").orDefault(0); + state int concurrentSeeks = params.getIntParam("concurrentSeeks").orDefault(64); + state int concurrentScans = params.getIntParam("concurrentScans").orDefault(64); printf("pageSize: %d\n", pageSize); printf("pageCacheBytes: %" PRId64 "\n", pageCacheBytes); @@ -8173,9 +8170,19 @@ TEST_CASE(":/redwood/performance/set") { printf("kvBytesTarget: %" PRId64 "\n", kvBytesTarget); printf("KeyLexicon '%c' to '%c'\n", firstKeyChar, lastKeyChar); printf("remapCleanupWindow: %" PRId64 "\n", remapCleanupWindow); + printf("fileName: %s\n", fileName.c_str()); + printf("concurrentScans: %d\n", concurrentScans); + printf("concurrentSeeks: %d\n", concurrentSeeks); + printf("openExisting: %d\n", openExisting); + printf("insertRecords: %d\n", insertRecords); - DWALPager* 
pager = new DWALPager(pageSize, pagerFile, pageCacheBytes, remapCleanupWindow); - state VersionedBTree* btree = new VersionedBTree(pager, pagerFile); + if (!openExisting) { + printf("Deleting old test data\n"); + deleteFile(fileName); + } + + DWALPager* pager = new DWALPager(pageSize, fileName, pageCacheBytes, remapCleanupWindow); + state VersionedBTree* btree = new VersionedBTree(pager, fileName); wait(btree->init()); state int64_t kvBytesThisCommit = 0; @@ -8188,7 +8195,7 @@ TEST_CASE(":/redwood/performance/set") { state double intervalStart = timer(); state double start = intervalStart; - if (reload) { + if (insertRecords) { while (kvBytesTotal < kvBytesTarget) { wait(yield()); @@ -8298,15 +8305,22 @@ TEST_CASE(":/redwood/performance/set") { wait(actors.signalAndReset()); printf("Stats:\n%s\n", g_redwoodMetrics.toString(true).c_str()); + printf("Parallel scans, concurrency=%d, no readAhead ...\n", concurrentScans); + for(int x = 0; x < concurrentScans; ++x) { + actors.add(randomScans(btree, ops, 50, 0, firstKeyChar, lastKeyChar)); + } + wait(actors.signalAndReset()); + printf("Stats:\n%s\n", g_redwoodMetrics.toString(true).c_str()); + printf("Serial seeks...\n"); actors.add(randomSeeks(btree, ops, firstKeyChar, lastKeyChar)); wait(actors.signalAndReset()); printf("Stats:\n%s\n", g_redwoodMetrics.toString(true).c_str()); - printf("Parallel seeks...\n"); - actors.add(randomSeeks(btree, ops, firstKeyChar, lastKeyChar)); - actors.add(randomSeeks(btree, ops, firstKeyChar, lastKeyChar)); - actors.add(randomSeeks(btree, ops, firstKeyChar, lastKeyChar)); + printf("Parallel seeks, concurrency=%d ...\n", concurrentSeeks); + for(int x = 0; x < concurrentSeeks; ++x) { + actors.add(randomSeeks(btree, ops, firstKeyChar, lastKeyChar)); + } wait(actors.signalAndReset()); printf("Stats:\n%s\n", g_redwoodMetrics.toString(true).c_str()); From aacee0656926dd19661dac94b9ef464619404af8 Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Tue, 6 Apr 2021 03:06:29 -0700 Subject: [PATCH 
036/180] Applied clang-format. --- fdbserver/VersionedBTree.actor.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 10866d8e12..c45ff44f38 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -8135,8 +8135,7 @@ TEST_CASE(":/redwood/performance/set") { state std::string fileName = params.getParam("fileName").orDefault("unittest.redwood"); state int pageSize = params.getIntParam("pageSize").orDefault(SERVER_KNOBS->REDWOOD_DEFAULT_PAGE_SIZE); - state int64_t pageCacheBytes = - params.getIntParam("pageCacheBytes").orDefault(FLOW_KNOBS->PAGE_CACHE_4K); + state int64_t pageCacheBytes = params.getIntParam("pageCacheBytes").orDefault(FLOW_KNOBS->PAGE_CACHE_4K); state int nodeCount = params.getIntParam("nodeCount").orDefault(1e9); state int maxRecordsPerCommit = params.getIntParam("maxRecordsPerCommit").orDefault(20000); state int maxKVBytesPerCommit = params.getIntParam("maxKVBytesPerCommit").orDefault(20e6); @@ -8306,7 +8305,7 @@ TEST_CASE(":/redwood/performance/set") { printf("Stats:\n%s\n", g_redwoodMetrics.toString(true).c_str()); printf("Parallel scans, concurrency=%d, no readAhead ...\n", concurrentScans); - for(int x = 0; x < concurrentScans; ++x) { + for (int x = 0; x < concurrentScans; ++x) { actors.add(randomScans(btree, ops, 50, 0, firstKeyChar, lastKeyChar)); } wait(actors.signalAndReset()); @@ -8318,7 +8317,7 @@ TEST_CASE(":/redwood/performance/set") { printf("Stats:\n%s\n", g_redwoodMetrics.toString(true).c_str()); printf("Parallel seeks, concurrency=%d ...\n", concurrentSeeks); - for(int x = 0; x < concurrentSeeks; ++x) { + for (int x = 0; x < concurrentSeeks; ++x) { actors.add(randomSeeks(btree, ops, firstKeyChar, lastKeyChar)); } wait(actors.signalAndReset()); From 394e5628033175ef2629f06a43401e44cfc301ed Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Tue, 6 Apr 2021 03:44:49 -0700 Subject: [PATCH 037/180] 
Added seek and scan counts, parallel reads divide count over concurrent readers. --- fdbserver/VersionedBTree.actor.cpp | 31 +++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index c45ff44f38..a7b999539f 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -8154,6 +8154,8 @@ TEST_CASE(":/redwood/performance/set") { state bool insertRecords = !openExisting || params.getIntParam("insertRecords").orDefault(0); state int concurrentSeeks = params.getIntParam("concurrentSeeks").orDefault(64); state int concurrentScans = params.getIntParam("concurrentScans").orDefault(64); + state int seeks = params.getIntParam("seeks").orDefault(1000000); + state int scans = params.getIntParam("scans").orDefault(20000); printf("pageSize: %d\n", pageSize); printf("pageCacheBytes: %" PRId64 "\n", pageCacheBytes); @@ -8169,9 +8171,11 @@ TEST_CASE(":/redwood/performance/set") { printf("kvBytesTarget: %" PRId64 "\n", kvBytesTarget); printf("KeyLexicon '%c' to '%c'\n", firstKeyChar, lastKeyChar); printf("remapCleanupWindow: %" PRId64 "\n", remapCleanupWindow); - printf("fileName: %s\n", fileName.c_str()); printf("concurrentScans: %d\n", concurrentScans); printf("concurrentSeeks: %d\n", concurrentSeeks); + printf("seeks: %d\n", seeks); + printf("scans: %d\n", scans); + printf("fileName: %s\n", fileName.c_str()); printf("openExisting: %d\n", openExisting); printf("insertRecords: %d\n", insertRecords); @@ -8269,56 +8273,53 @@ TEST_CASE(":/redwood/performance/set") { kvBytesTotal / (timer() - start) / 1e6); } - int seeks = 1e6; printf("Warming cache with seeks\n"); - actors.add(randomSeeks(btree, seeks / 3, firstKeyChar, lastKeyChar)); - actors.add(randomSeeks(btree, seeks / 3, firstKeyChar, lastKeyChar)); - actors.add(randomSeeks(btree, seeks / 3, firstKeyChar, lastKeyChar)); + for (int x = 0; x < concurrentSeeks; ++x) { + 
actors.add(randomSeeks(btree, seeks / concurrentSeeks, firstKeyChar, lastKeyChar)); + } wait(actors.signalAndReset()); printf("Stats:\n%s\n", g_redwoodMetrics.toString(true).c_str()); - state int ops = 10000; - printf("Serial scans with adaptive readAhead...\n"); - actors.add(randomScans(btree, ops, 50, -1, firstKeyChar, lastKeyChar)); + actors.add(randomScans(btree, scans, 50, -1, firstKeyChar, lastKeyChar)); wait(actors.signalAndReset()); printf("Stats:\n%s\n", g_redwoodMetrics.toString(true).c_str()); printf("Serial scans with readAhead 3 pages...\n"); - actors.add(randomScans(btree, ops, 50, 12000, firstKeyChar, lastKeyChar)); + actors.add(randomScans(btree, scans, 50, 12000, firstKeyChar, lastKeyChar)); wait(actors.signalAndReset()); printf("Stats:\n%s\n", g_redwoodMetrics.toString(true).c_str()); printf("Serial scans with readAhead 2 pages...\n"); - actors.add(randomScans(btree, ops, 50, 8000, firstKeyChar, lastKeyChar)); + actors.add(randomScans(btree, scans, 50, 8000, firstKeyChar, lastKeyChar)); wait(actors.signalAndReset()); printf("Stats:\n%s\n", g_redwoodMetrics.toString(true).c_str()); printf("Serial scans with readAhead 1 page...\n"); - actors.add(randomScans(btree, ops, 50, 4000, firstKeyChar, lastKeyChar)); + actors.add(randomScans(btree, scans, 50, 4000, firstKeyChar, lastKeyChar)); wait(actors.signalAndReset()); printf("Stats:\n%s\n", g_redwoodMetrics.toString(true).c_str()); printf("Serial scans...\n"); - actors.add(randomScans(btree, ops, 50, 0, firstKeyChar, lastKeyChar)); + actors.add(randomScans(btree, scans, 50, 0, firstKeyChar, lastKeyChar)); wait(actors.signalAndReset()); printf("Stats:\n%s\n", g_redwoodMetrics.toString(true).c_str()); printf("Parallel scans, concurrency=%d, no readAhead ...\n", concurrentScans); for (int x = 0; x < concurrentScans; ++x) { - actors.add(randomScans(btree, ops, 50, 0, firstKeyChar, lastKeyChar)); + actors.add(randomScans(btree, scans / concurrentScans, 50, 0, firstKeyChar, lastKeyChar)); } 
wait(actors.signalAndReset()); printf("Stats:\n%s\n", g_redwoodMetrics.toString(true).c_str()); printf("Serial seeks...\n"); - actors.add(randomSeeks(btree, ops, firstKeyChar, lastKeyChar)); + actors.add(randomSeeks(btree, seeks, firstKeyChar, lastKeyChar)); wait(actors.signalAndReset()); printf("Stats:\n%s\n", g_redwoodMetrics.toString(true).c_str()); printf("Parallel seeks, concurrency=%d ...\n", concurrentSeeks); for (int x = 0; x < concurrentSeeks; ++x) { - actors.add(randomSeeks(btree, ops, firstKeyChar, lastKeyChar)); + actors.add(randomSeeks(btree, seeks / concurrentSeeks, firstKeyChar, lastKeyChar)); } wait(actors.signalAndReset()); printf("Stats:\n%s\n", g_redwoodMetrics.toString(true).c_str()); From 433872e17d2a128acc87d1efcc51fac152d0d706 Mon Sep 17 00:00:00 2001 From: Lukas Joswiak Date: Tue, 6 Apr 2021 17:28:28 -0700 Subject: [PATCH 038/180] Sample actors waiting on network --- fdbclient/InstrumentRequest.h | 50 +++++++++++++++++++++++++++++++++++ fdbclient/NativeAPI.actor.cpp | 5 ++++ fdbrpc/FlowTests.actor.cpp | 4 +++ fdbrpc/sim2.actor.cpp | 7 +++++ flow/Net2.actor.cpp | 8 ++++++ flow/Platform.actor.cpp | 12 ++++++--- flow/network.h | 4 +++ 7 files changed, 86 insertions(+), 4 deletions(-) create mode 100644 fdbclient/InstrumentRequest.h diff --git a/fdbclient/InstrumentRequest.h b/fdbclient/InstrumentRequest.h new file mode 100644 index 0000000000..77adbd1490 --- /dev/null +++ b/fdbclient/InstrumentRequest.h @@ -0,0 +1,50 @@ +/* + * InstrumentRequest.h + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2021 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "flow/flow.h" +#include "flow/network.h" + +// Used to manually instrument waiting actors to collect samples for the +// sampling profiler. +struct InstrumentRequest { + unsigned index; + + InstrumentRequest() {} + + // This API isn't great. Ideally, no cleanup call is needed. I ran into an + // issue around the destructor being called twice because an instance of + // this class has to be stored as a class member (otherwise it goes away + // when wait is called), and due to how Flow does code generation the + // member will be default initialized and then initialized again when it is + // initially set. Then, the destructor will be called twice, causing issues + // when the WriteOnlySet tries to erase the same index twice. 
I'm working + // on this :) + + void start() { + index = g_network->getActorLineageSet().insert(currentLineage); + } + + void complete() { + g_network->getActorLineageSet().erase(index); + } +}; + diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index a0ed70997c..41e63c68f8 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -36,6 +36,7 @@ #include "fdbclient/ClusterInterface.h" #include "fdbclient/CoordinationInterface.h" #include "fdbclient/DatabaseContext.h" +#include "fdbclient/InstrumentRequest.h" #include "fdbclient/JsonBuilder.h" #include "fdbclient/KeyRangeMap.h" #include "fdbclient/Knobs.h" @@ -1770,6 +1771,7 @@ void runNetwork() { if (networkOptions.traceDirectory.present() && networkOptions.runLoopProfilingEnabled) { setupRunLoopProfiler(); } + setupSamplingProfiler(); g_network->run(); @@ -3025,6 +3027,8 @@ ACTOR Future> getRange(Database cx, throw deterministicRandom()->randomChoice( std::vector{ transaction_too_old(), future_version() }); } + state InstrumentRequest request; + request.start(); GetKeyValuesReply _rep = wait(loadBalance(cx.getPtr(), beginServer.second, @@ -3035,6 +3039,7 @@ ACTOR Future> getRange(Database cx, cx->enableLocalityLoadBalance ? 
&cx->queueModel : nullptr)); rep = _rep; ++cx->transactionPhysicalReadsCompleted; + request.complete(); } catch (Error&) { ++cx->transactionPhysicalReadsCompleted; throw; diff --git a/fdbrpc/FlowTests.actor.cpp b/fdbrpc/FlowTests.actor.cpp index 40e4ed1c52..c965149f70 100644 --- a/fdbrpc/FlowTests.actor.cpp +++ b/fdbrpc/FlowTests.actor.cpp @@ -24,6 +24,7 @@ #include "flow/UnitTest.h" #include "flow/DeterministicRandom.h" #include "flow/IThreadPool.h" +#include "flow/WriteOnlySet.h" #include "fdbrpc/fdbrpc.h" #include "fdbrpc/IAsyncFile.h" #include "flow/TLSConfig.actor.h" @@ -283,6 +284,9 @@ struct YieldMockNetwork final : INetwork, ReferenceCounted { static TLSConfig emptyConfig; return emptyConfig; } + ActorLineageSet& getActorLineageSet() override { + throw std::exception(); + } ProtocolVersion protocolVersion() override { return baseNetwork->protocolVersion(); } }; diff --git a/fdbrpc/sim2.actor.cpp b/fdbrpc/sim2.actor.cpp index e9219f3ff3..4bd2c9399e 100644 --- a/fdbrpc/sim2.actor.cpp +++ b/fdbrpc/sim2.actor.cpp @@ -31,6 +31,7 @@ #include "flow/IThreadPool.h" #include "flow/ProtocolVersion.h" #include "flow/Util.h" +#include "flow/WriteOnlySet.h" #include "fdbrpc/IAsyncFile.h" #include "fdbrpc/AsyncFileCached.actor.h" #include "fdbrpc/AsyncFileNonDurable.actor.h" @@ -975,6 +976,10 @@ public: bool checkRunnable() override { return net2->checkRunnable(); } + ActorLineageSet& getActorLineageSet() override { + return actorLineageSet; + } + void stop() override { isStopped = true; } void addStopCallback(std::function fn) override { stopCallbacks.emplace_back(std::move(fn)); } bool isSimulated() const override { return true; } @@ -2117,6 +2122,8 @@ public: // Whether or not yield has returned true during the current iteration of the run loop bool yielded; int yield_limit; // how many more times yield may return false before next returning true + + ActorLineageSet actorLineageSet; }; class UDPSimSocket : public IUDPSocket, ReferenceCounted { diff --git 
a/flow/Net2.actor.cpp b/flow/Net2.actor.cpp index 5026d6a982..bb3c675de4 100644 --- a/flow/Net2.actor.cpp +++ b/flow/Net2.actor.cpp @@ -198,6 +198,8 @@ public: bool checkRunnable() override; + ActorLineageSet& getActorLineageSet() override; + bool useThreadPool; // private: @@ -225,6 +227,8 @@ public: std::atomic stopped; mutable std::map addressOnHostCache; + ActorLineageSet actorLineageSet; + std::atomic started; uint64_t numYields; @@ -1377,6 +1381,10 @@ bool Net2::checkRunnable() { return !started.exchange(true); } +ActorLineageSet& Net2::getActorLineageSet() { + return actorLineageSet; +} + void Net2::run() { TraceEvent::setNetworkThread(); TraceEvent("Net2Running"); diff --git a/flow/Platform.actor.cpp b/flow/Platform.actor.cpp index 50f252021b..5be9b6423f 100644 --- a/flow/Platform.actor.cpp +++ b/flow/Platform.actor.cpp @@ -3679,8 +3679,7 @@ void* sampleThread(void* arg) { while (true) { threadSleep(1.0); // TODO: Read sample rate from global config - // TODO: Copy actor lineage of currently running actor - // Read currentLineage + // Get actor lineage of currently running actor. auto actorLineage = currentLineageThreadSafe.get(); printf("Currently running actor lineage (%p):\n", actorLineage.getPtr()); auto stack = actorLineage->stack(&StackLineage::actorName); @@ -3690,11 +3689,16 @@ void* sampleThread(void* arg) { } printf("\n"); + // Get lineage of actors waiting on disk. 
auto diskAlps = IAsyncFileSystem::filesystem()->getActorLineageSet().copy(); - printf("Disk ALPs: %d\n", diskAlps.size()); + // printf("Disk ALPs: %d\n", diskAlps.size()); + + // TODO: Get lineage of actors waiting on network + auto networkAlps = g_network->getActorLineageSet().copy(); + printf("Network ALPs: %d\n", networkAlps.size()); // TODO: Call collect on all actor lineages - for (auto actorLineage : diskAlps) { + for (auto actorLineage : networkAlps) { auto stack = actorLineage->stack(&StackLineage::actorName); while (!stack.empty()) { printf("%s ", stack.top()); diff --git a/flow/network.h b/flow/network.h index 33fb7b0f26..b335db3c2d 100644 --- a/flow/network.h +++ b/flow/network.h @@ -34,6 +34,7 @@ #include "flow/Arena.h" #include "flow/IRandom.h" #include "flow/Trace.h" +#include "flow/WriteOnlySet.h" enum class TaskPriority { Max = 1000000, @@ -535,6 +536,9 @@ public: // returns false. virtual bool checkRunnable() = 0; + // Returns the shared memory data structure used to store actor lineages. 
+ virtual ActorLineageSet& getActorLineageSet() = 0; + virtual ProtocolVersion protocolVersion() = 0; // Shorthand for transport().getLocalAddress() From c481ba2cfa0f330230d83f8f290fa663d9e08348 Mon Sep 17 00:00:00 2001 From: Lukas Joswiak Date: Tue, 6 Apr 2021 17:32:02 -0700 Subject: [PATCH 039/180] Update annotation class name --- fdbclient/{InstrumentRequest.h => AnnotateActor.h} | 6 +++--- fdbclient/NativeAPI.actor.cpp | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) rename fdbclient/{InstrumentRequest.h => AnnotateActor.h} (95%) diff --git a/fdbclient/InstrumentRequest.h b/fdbclient/AnnotateActor.h similarity index 95% rename from fdbclient/InstrumentRequest.h rename to fdbclient/AnnotateActor.h index 77adbd1490..cf5bf2c57e 100644 --- a/fdbclient/InstrumentRequest.h +++ b/fdbclient/AnnotateActor.h @@ -1,5 +1,5 @@ /* - * InstrumentRequest.h + * AnnotateActor.h * * This source file is part of the FoundationDB open source project * @@ -25,10 +25,10 @@ // Used to manually instrument waiting actors to collect samples for the // sampling profiler. -struct InstrumentRequest { +struct AnnotateActor { unsigned index; - InstrumentRequest() {} + AnnotateActor() {} // This API isn't great. Ideally, no cleanup call is needed. 
I ran into an // issue around the destructor being called twice because an instance of diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 41e63c68f8..e6d9463157 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -32,11 +32,11 @@ #include "fdbrpc/FailureMonitor.h" #include "fdbrpc/MultiInterface.h" +#include "fdbclient/AnnotateActor.h" #include "fdbclient/Atomic.h" #include "fdbclient/ClusterInterface.h" #include "fdbclient/CoordinationInterface.h" #include "fdbclient/DatabaseContext.h" -#include "fdbclient/InstrumentRequest.h" #include "fdbclient/JsonBuilder.h" #include "fdbclient/KeyRangeMap.h" #include "fdbclient/Knobs.h" @@ -3027,8 +3027,8 @@ ACTOR Future> getRange(Database cx, throw deterministicRandom()->randomChoice( std::vector{ transaction_too_old(), future_version() }); } - state InstrumentRequest request; - request.start(); + state AnnotateActor annotation; + annotation.start(); GetKeyValuesReply _rep = wait(loadBalance(cx.getPtr(), beginServer.second, @@ -3039,7 +3039,7 @@ ACTOR Future> getRange(Database cx, cx->enableLocalityLoadBalance ? &cx->queueModel : nullptr)); rep = _rep; ++cx->transactionPhysicalReadsCompleted; - request.complete(); + annotation.complete(); } catch (Error&) { ++cx->transactionPhysicalReadsCompleted; throw; From 18120d6b1a528a975035948425d04a5d81b73c44 Mon Sep 17 00:00:00 2001 From: sfc-gh-tclinkenbeard Date: Tue, 6 Apr 2021 22:13:15 -0700 Subject: [PATCH 040/180] Add MasterMetrics periodic logging --- fdbserver/masterserver.actor.cpp | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/fdbserver/masterserver.actor.cpp b/fdbserver/masterserver.actor.cpp index 5fbf5bc2de..179c2e5c75 100644 --- a/fdbserver/masterserver.actor.cpp +++ b/fdbserver/masterserver.actor.cpp @@ -245,6 +245,15 @@ struct MasterData : NonCopyable, ReferenceCounted { std::vector backupWorkers; // Recruited backup workers from cluster controller. 
+ CounterCollection cc; + Counter changeCoordinatorsRequests; + Counter getCommitVersionRequests; + Counter backupWorkerDoneRequests; + Counter getLiveCommittedVersionRequests; + Counter reportLiveCommittedVersionRequests; + + Future logger; + MasterData(Reference> const& dbInfo, MasterInterface const& myInterface, ServerCoordinators const& coordinators, @@ -258,7 +267,13 @@ struct MasterData : NonCopyable, ReferenceCounted { lastEpochEnd(invalidVersion), liveCommittedVersion(invalidVersion), databaseLocked(false), minKnownCommittedVersion(invalidVersion), recoveryTransactionVersion(invalidVersion), lastCommitTime(0), registrationCount(0), version(invalidVersion), lastVersionTime(0), txnStateStore(0), memoryLimit(2e9), - addActor(addActor), hasConfiguration(false), recruitmentStalled(makeReference>(false)) { + addActor(addActor), hasConfiguration(false), recruitmentStalled(makeReference>(false)), + cc("Master", dbgid.toString()), changeCoordinatorsRequests("ChangeCoordinatorsRequests", cc), + getCommitVersionRequests("GetCommitVersionRequests", cc), + backupWorkerDoneRequests("BackupWorkerDoneRequests", cc), + getLiveCommittedVersionRequests("GetLiveCommittedVersionRequests", cc), + reportLiveCommittedVersionRequests("ReportLiveCommittedVersionRequests", cc) { + logger = traceCounters("MasterMetrics", dbgid, SERVER_KNOBS->WORKER_LOGGING_INTERVAL, &cc, "MasterMetrics"); if (forceRecovery && !myInterface.locality.dcId().present()) { TraceEvent(SevError, "ForcedRecoveryRequiresDcID"); forceRecovery = false; @@ -1095,6 +1110,8 @@ ACTOR Future getVersion(Reference self, GetCommitVersionReques state std::map::iterator proxyItr = self->lastCommitProxyVersionReplies.find(req.requestingProxy); // lastCommitProxyVersionReplies never changes + ++self->getCommitVersionRequests; + if (proxyItr == self->lastCommitProxyVersionReplies.end()) { // Request from invalid proxy (e.g. 
from duplicate recruitment request) req.reply.send(Never()); @@ -1191,6 +1208,7 @@ ACTOR Future serveLiveCommittedVersion(Reference self) { if (self->liveCommittedVersion == invalidVersion) { self->liveCommittedVersion = self->recoveryTransactionVersion; } + ++self->getLiveCommittedVersionRequests; GetRawCommittedVersionReply reply; reply.version = self->liveCommittedVersion; reply.locked = self->databaseLocked; @@ -1206,6 +1224,7 @@ ACTOR Future serveLiveCommittedVersion(Reference self) { self->databaseLocked = req.locked; self->proxyMetadataVersion = req.metadataVersion; } + ++self->reportLiveCommittedVersionRequests; req.reply.send(Void()); } } @@ -1374,6 +1393,7 @@ static std::set const& normalMasterErrors() { ACTOR Future changeCoordinators(Reference self) { loop { ChangeCoordinatorsRequest req = waitNext(self->myInterface.changeCoordinators.getFuture()); + ++self->changeCoordinatorsRequests; state ChangeCoordinatorsRequest changeCoordinatorsRequest = req; while (!self->cstate.previousWrite.isReady()) { @@ -1981,6 +2001,7 @@ ACTOR Future masterServer(MasterInterface mi, if (self->logSystem.isValid() && self->logSystem->removeBackupWorker(req)) { self->registrationTrigger.trigger(); } + ++self->backupWorkerDoneRequests; req.reply.send(Void()); } when(wait(collection)) { From 5c79d29140614245e21beb4dc12e1be30479e98d Mon Sep 17 00:00:00 2001 From: Lukas Joswiak Date: Wed, 7 Apr 2021 10:59:45 -0700 Subject: [PATCH 041/180] Use object lifetimes instead of function calls --- fdbclient/AnnotateActor.h | 37 +++++++++++++++++++++++------------ fdbclient/NativeAPI.actor.cpp | 4 +--- 2 files changed, 25 insertions(+), 16 deletions(-) diff --git a/fdbclient/AnnotateActor.h b/fdbclient/AnnotateActor.h index cf5bf2c57e..0d0cd4a632 100644 --- a/fdbclient/AnnotateActor.h +++ b/fdbclient/AnnotateActor.h @@ -27,24 +27,35 @@ // sampling profiler. struct AnnotateActor { unsigned index; + bool set; - AnnotateActor() {} + AnnotateActor() : set(false) {} - // This API isn't great. 
Ideally, no cleanup call is needed. I ran into an - // issue around the destructor being called twice because an instance of - // this class has to be stored as a class member (otherwise it goes away - // when wait is called), and due to how Flow does code generation the - // member will be default initialized and then initialized again when it is - // initially set. Then, the destructor will be called twice, causing issues - // when the WriteOnlySet tries to erase the same index twice. I'm working - // on this :) + AnnotateActor(Reference lineage) : set(true) { + index = g_network->getActorLineageSet().insert(lineage); + } - void start() { - index = g_network->getActorLineageSet().insert(currentLineage); + AnnotateActor(const AnnotateActor& other) = delete; + AnnotateActor(AnnotateActor&& other) = delete; + AnnotateActor& operator=(const AnnotateActor& other) = delete; + + AnnotateActor& operator=(AnnotateActor&& other) { + if (this == &other) { + return *this; + } + + this->index = other.index; + this->set = other.set; + + other.set = false; + + return *this; } - void complete() { - g_network->getActorLineageSet().erase(index); + ~AnnotateActor() { + if (set) { + g_network->getActorLineageSet().erase(index); + } } }; diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index e6d9463157..f05257e06d 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -3027,8 +3027,7 @@ ACTOR Future> getRange(Database cx, throw deterministicRandom()->randomChoice( std::vector{ transaction_too_old(), future_version() }); } - state AnnotateActor annotation; - annotation.start(); + state AnnotateActor annotation(currentLineage); GetKeyValuesReply _rep = wait(loadBalance(cx.getPtr(), beginServer.second, @@ -3039,7 +3038,6 @@ ACTOR Future> getRange(Database cx, cx->enableLocalityLoadBalance ? 
&cx->queueModel : nullptr)); rep = _rep; ++cx->transactionPhysicalReadsCompleted; - annotation.complete(); } catch (Error&) { ++cx->transactionPhysicalReadsCompleted; throw; From d6c4aa67d71c829c2da198a65c4753cbaa1c1246 Mon Sep 17 00:00:00 2001 From: Lukas Joswiak Date: Tue, 6 Apr 2021 17:28:28 -0700 Subject: [PATCH 042/180] Sample actors waiting on network --- fdbclient/InstrumentRequest.h | 50 +++++++++++++++++++++++++++++++++++ fdbclient/NativeAPI.actor.cpp | 5 ++++ fdbrpc/FlowTests.actor.cpp | 4 +++ fdbrpc/sim2.actor.cpp | 7 +++++ flow/Net2.actor.cpp | 8 ++++++ flow/Platform.actor.cpp | 12 ++++++--- flow/network.h | 4 +++ 7 files changed, 86 insertions(+), 4 deletions(-) create mode 100644 fdbclient/InstrumentRequest.h diff --git a/fdbclient/InstrumentRequest.h b/fdbclient/InstrumentRequest.h new file mode 100644 index 0000000000..77adbd1490 --- /dev/null +++ b/fdbclient/InstrumentRequest.h @@ -0,0 +1,50 @@ +/* + * InstrumentRequest.h + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2021 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "flow/flow.h" +#include "flow/network.h" + +// Used to manually instrument waiting actors to collect samples for the +// sampling profiler. +struct InstrumentRequest { + unsigned index; + + InstrumentRequest() {} + + // This API isn't great. Ideally, no cleanup call is needed. 
I ran into an + // issue around the destructor being called twice because an instance of + // this class has to be stored as a class member (otherwise it goes away + // when wait is called), and due to how Flow does code generation the + // member will be default initialized and then initialized again when it is + // initially set. Then, the destructor will be called twice, causing issues + // when the WriteOnlySet tries to erase the same index twice. I'm working + // on this :) + + void start() { + index = g_network->getActorLineageSet().insert(currentLineage); + } + + void complete() { + g_network->getActorLineageSet().erase(index); + } +}; + diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 8b55757621..0952bae4d4 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -36,6 +36,7 @@ #include "fdbclient/ClusterInterface.h" #include "fdbclient/CoordinationInterface.h" #include "fdbclient/DatabaseContext.h" +#include "fdbclient/InstrumentRequest.h" #include "fdbclient/JsonBuilder.h" #include "fdbclient/KeyRangeMap.h" #include "fdbclient/Knobs.h" @@ -1796,6 +1797,7 @@ void runNetwork() { if (networkOptions.traceDirectory.present() && networkOptions.runLoopProfilingEnabled) { setupRunLoopProfiler(); } + setupSamplingProfiler(); g_network->run(); @@ -3051,6 +3053,8 @@ ACTOR Future> getRange(Database cx, throw deterministicRandom()->randomChoice( std::vector{ transaction_too_old(), future_version() }); } + state InstrumentRequest request; + request.start(); GetKeyValuesReply _rep = wait(loadBalance(cx.getPtr(), beginServer.second, @@ -3061,6 +3065,7 @@ ACTOR Future> getRange(Database cx, cx->enableLocalityLoadBalance ? 
&cx->queueModel : nullptr)); rep = _rep; ++cx->transactionPhysicalReadsCompleted; + request.complete(); } catch (Error&) { ++cx->transactionPhysicalReadsCompleted; throw; diff --git a/fdbrpc/FlowTests.actor.cpp b/fdbrpc/FlowTests.actor.cpp index 40e4ed1c52..c965149f70 100644 --- a/fdbrpc/FlowTests.actor.cpp +++ b/fdbrpc/FlowTests.actor.cpp @@ -24,6 +24,7 @@ #include "flow/UnitTest.h" #include "flow/DeterministicRandom.h" #include "flow/IThreadPool.h" +#include "flow/WriteOnlySet.h" #include "fdbrpc/fdbrpc.h" #include "fdbrpc/IAsyncFile.h" #include "flow/TLSConfig.actor.h" @@ -283,6 +284,9 @@ struct YieldMockNetwork final : INetwork, ReferenceCounted { static TLSConfig emptyConfig; return emptyConfig; } + ActorLineageSet& getActorLineageSet() override { + throw std::exception(); + } ProtocolVersion protocolVersion() override { return baseNetwork->protocolVersion(); } }; diff --git a/fdbrpc/sim2.actor.cpp b/fdbrpc/sim2.actor.cpp index 3b965d22b9..5cf65da0a5 100644 --- a/fdbrpc/sim2.actor.cpp +++ b/fdbrpc/sim2.actor.cpp @@ -31,6 +31,7 @@ #include "flow/IThreadPool.h" #include "flow/ProtocolVersion.h" #include "flow/Util.h" +#include "flow/WriteOnlySet.h" #include "fdbrpc/IAsyncFile.h" #include "fdbrpc/AsyncFileCached.actor.h" #include "fdbrpc/AsyncFileNonDurable.actor.h" @@ -975,6 +976,10 @@ public: bool checkRunnable() override { return net2->checkRunnable(); } + ActorLineageSet& getActorLineageSet() override { + return actorLineageSet; + } + void stop() override { isStopped = true; } void addStopCallback(std::function fn) override { stopCallbacks.emplace_back(std::move(fn)); } bool isSimulated() const override { return true; } @@ -2117,6 +2122,8 @@ public: // Whether or not yield has returned true during the current iteration of the run loop bool yielded; int yield_limit; // how many more times yield may return false before next returning true + + ActorLineageSet actorLineageSet; }; class UDPSimSocket : public IUDPSocket, ReferenceCounted { diff --git 
a/flow/Net2.actor.cpp b/flow/Net2.actor.cpp index bb0b0325c6..fb64671c28 100644 --- a/flow/Net2.actor.cpp +++ b/flow/Net2.actor.cpp @@ -204,6 +204,8 @@ public: bool checkRunnable() override; + ActorLineageSet& getActorLineageSet() override; + bool useThreadPool; // private: @@ -231,6 +233,8 @@ public: std::atomic stopped; mutable std::map addressOnHostCache; + ActorLineageSet actorLineageSet; + std::atomic started; uint64_t numYields; @@ -1383,6 +1387,10 @@ bool Net2::checkRunnable() { return !started.exchange(true); } +ActorLineageSet& Net2::getActorLineageSet() { + return actorLineageSet; +} + void Net2::run() { TraceEvent::setNetworkThread(); TraceEvent("Net2Running"); diff --git a/flow/Platform.actor.cpp b/flow/Platform.actor.cpp index 50f252021b..5be9b6423f 100644 --- a/flow/Platform.actor.cpp +++ b/flow/Platform.actor.cpp @@ -3679,8 +3679,7 @@ void* sampleThread(void* arg) { while (true) { threadSleep(1.0); // TODO: Read sample rate from global config - // TODO: Copy actor lineage of currently running actor - // Read currentLineage + // Get actor lineage of currently running actor. auto actorLineage = currentLineageThreadSafe.get(); printf("Currently running actor lineage (%p):\n", actorLineage.getPtr()); auto stack = actorLineage->stack(&StackLineage::actorName); @@ -3690,11 +3689,16 @@ void* sampleThread(void* arg) { } printf("\n"); + // Get lineage of actors waiting on disk. 
auto diskAlps = IAsyncFileSystem::filesystem()->getActorLineageSet().copy(); - printf("Disk ALPs: %d\n", diskAlps.size()); + // printf("Disk ALPs: %d\n", diskAlps.size()); + + // TODO: Get lineage of actors waiting on network + auto networkAlps = g_network->getActorLineageSet().copy(); + printf("Network ALPs: %d\n", networkAlps.size()); // TODO: Call collect on all actor lineages - for (auto actorLineage : diskAlps) { + for (auto actorLineage : networkAlps) { auto stack = actorLineage->stack(&StackLineage::actorName); while (!stack.empty()) { printf("%s ", stack.top()); diff --git a/flow/network.h b/flow/network.h index d0f117dede..ec14167121 100644 --- a/flow/network.h +++ b/flow/network.h @@ -35,6 +35,7 @@ #include "flow/Arena.h" #include "flow/IRandom.h" #include "flow/Trace.h" +#include "flow/WriteOnlySet.h" enum class TaskPriority { Max = 1000000, @@ -558,6 +559,9 @@ public: // returns false. virtual bool checkRunnable() = 0; + // Returns the shared memory data structure used to store actor lineages. 
+ virtual ActorLineageSet& getActorLineageSet() = 0; + virtual ProtocolVersion protocolVersion() = 0; // Shorthand for transport().getLocalAddress() From d60011aa74105f496cd76578cd7413c84864f884 Mon Sep 17 00:00:00 2001 From: Lukas Joswiak Date: Tue, 6 Apr 2021 17:32:02 -0700 Subject: [PATCH 043/180] Update annotation class name --- fdbclient/{InstrumentRequest.h => AnnotateActor.h} | 6 +++--- fdbclient/NativeAPI.actor.cpp | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) rename fdbclient/{InstrumentRequest.h => AnnotateActor.h} (95%) diff --git a/fdbclient/InstrumentRequest.h b/fdbclient/AnnotateActor.h similarity index 95% rename from fdbclient/InstrumentRequest.h rename to fdbclient/AnnotateActor.h index 77adbd1490..cf5bf2c57e 100644 --- a/fdbclient/InstrumentRequest.h +++ b/fdbclient/AnnotateActor.h @@ -1,5 +1,5 @@ /* - * InstrumentRequest.h + * AnnotateActor.h * * This source file is part of the FoundationDB open source project * @@ -25,10 +25,10 @@ // Used to manually instrument waiting actors to collect samples for the // sampling profiler. -struct InstrumentRequest { +struct AnnotateActor { unsigned index; - InstrumentRequest() {} + AnnotateActor() {} // This API isn't great. Ideally, no cleanup call is needed. 
I ran into an // issue around the destructor being called twice because an instance of diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 0952bae4d4..cdac01f56f 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -32,11 +32,11 @@ #include "fdbrpc/FailureMonitor.h" #include "fdbrpc/MultiInterface.h" +#include "fdbclient/AnnotateActor.h" #include "fdbclient/Atomic.h" #include "fdbclient/ClusterInterface.h" #include "fdbclient/CoordinationInterface.h" #include "fdbclient/DatabaseContext.h" -#include "fdbclient/InstrumentRequest.h" #include "fdbclient/JsonBuilder.h" #include "fdbclient/KeyRangeMap.h" #include "fdbclient/Knobs.h" @@ -3053,8 +3053,8 @@ ACTOR Future> getRange(Database cx, throw deterministicRandom()->randomChoice( std::vector{ transaction_too_old(), future_version() }); } - state InstrumentRequest request; - request.start(); + state AnnotateActor annotation; + annotation.start(); GetKeyValuesReply _rep = wait(loadBalance(cx.getPtr(), beginServer.second, @@ -3065,7 +3065,7 @@ ACTOR Future> getRange(Database cx, cx->enableLocalityLoadBalance ? &cx->queueModel : nullptr)); rep = _rep; ++cx->transactionPhysicalReadsCompleted; - request.complete(); + annotation.complete(); } catch (Error&) { ++cx->transactionPhysicalReadsCompleted; throw; From 130e520ad78aefbe3dccf02680f13dfdc5d9ac89 Mon Sep 17 00:00:00 2001 From: Lukas Joswiak Date: Wed, 7 Apr 2021 10:59:45 -0700 Subject: [PATCH 044/180] Use object lifetimes instead of function calls --- fdbclient/AnnotateActor.h | 37 +++++++++++++++++++++++------------ fdbclient/NativeAPI.actor.cpp | 4 +--- 2 files changed, 25 insertions(+), 16 deletions(-) diff --git a/fdbclient/AnnotateActor.h b/fdbclient/AnnotateActor.h index cf5bf2c57e..0d0cd4a632 100644 --- a/fdbclient/AnnotateActor.h +++ b/fdbclient/AnnotateActor.h @@ -27,24 +27,35 @@ // sampling profiler. 
struct AnnotateActor { unsigned index; + bool set; - AnnotateActor() {} + AnnotateActor() : set(false) {} - // This API isn't great. Ideally, no cleanup call is needed. I ran into an - // issue around the destructor being called twice because an instance of - // this class has to be stored as a class member (otherwise it goes away - // when wait is called), and due to how Flow does code generation the - // member will be default initialized and then initialized again when it is - // initially set. Then, the destructor will be called twice, causing issues - // when the WriteOnlySet tries to erase the same index twice. I'm working - // on this :) + AnnotateActor(Reference lineage) : set(true) { + index = g_network->getActorLineageSet().insert(lineage); + } - void start() { - index = g_network->getActorLineageSet().insert(currentLineage); + AnnotateActor(const AnnotateActor& other) = delete; + AnnotateActor(AnnotateActor&& other) = delete; + AnnotateActor& operator=(const AnnotateActor& other) = delete; + + AnnotateActor& operator=(AnnotateActor&& other) { + if (this == &other) { + return *this; + } + + this->index = other.index; + this->set = other.set; + + other.set = false; + + return *this; } - void complete() { - g_network->getActorLineageSet().erase(index); + ~AnnotateActor() { + if (set) { + g_network->getActorLineageSet().erase(index); + } } }; diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index cdac01f56f..b208107fde 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -3053,8 +3053,7 @@ ACTOR Future> getRange(Database cx, throw deterministicRandom()->randomChoice( std::vector{ transaction_too_old(), future_version() }); } - state AnnotateActor annotation; - annotation.start(); + state AnnotateActor annotation(currentLineage); GetKeyValuesReply _rep = wait(loadBalance(cx.getPtr(), beginServer.second, @@ -3065,7 +3064,6 @@ ACTOR Future> getRange(Database cx, cx->enableLocalityLoadBalance ? 
&cx->queueModel : nullptr)); rep = _rep; ++cx->transactionPhysicalReadsCompleted; - annotation.complete(); } catch (Error&) { ++cx->transactionPhysicalReadsCompleted; throw; From 83cf9658750bfed301702c7e24d4de5de0fb1a65 Mon Sep 17 00:00:00 2001 From: Lukas Joswiak Date: Wed, 7 Apr 2021 15:38:01 -0700 Subject: [PATCH 045/180] Add global variable to fetch each type of sample --- fdbclient/AnnotateActor.cpp | 23 +++++++++++++++++++++++ fdbclient/AnnotateActor.h | 3 +++ fdbclient/CMakeLists.txt | 1 + flow/Platform.actor.cpp | 32 +++++++++++++++++--------------- 4 files changed, 44 insertions(+), 15 deletions(-) create mode 100644 fdbclient/AnnotateActor.cpp diff --git a/fdbclient/AnnotateActor.cpp b/fdbclient/AnnotateActor.cpp new file mode 100644 index 0000000000..80b9a8cec4 --- /dev/null +++ b/fdbclient/AnnotateActor.cpp @@ -0,0 +1,23 @@ +/* + * AnnotateActor.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2021 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "fdbclient/AnnotateActor.h" + +std::map>()>> samples; diff --git a/fdbclient/AnnotateActor.h b/fdbclient/AnnotateActor.h index 0d0cd4a632..265d1bb3ad 100644 --- a/fdbclient/AnnotateActor.h +++ b/fdbclient/AnnotateActor.h @@ -59,3 +59,6 @@ struct AnnotateActor { } }; +enum WaitState { Disk, Network }; + +extern std::map>()>> samples; diff --git a/fdbclient/CMakeLists.txt b/fdbclient/CMakeLists.txt index 129f9e7d3e..0f61d0c638 100644 --- a/fdbclient/CMakeLists.txt +++ b/fdbclient/CMakeLists.txt @@ -1,4 +1,5 @@ set(FDBCLIENT_SRCS + AnnotateActor.cpp AsyncFileS3BlobStore.actor.cpp AsyncFileS3BlobStore.actor.h AsyncTaskThread.actor.cpp diff --git a/flow/Platform.actor.cpp b/flow/Platform.actor.cpp index 5be9b6423f..be12a594d2 100644 --- a/flow/Platform.actor.cpp +++ b/flow/Platform.actor.cpp @@ -50,6 +50,8 @@ #include "fdbrpc/IAsyncFile.h" +#include "fdbclient/AnnotateActor.h" + #ifdef _WIN32 #include #include @@ -3689,31 +3691,31 @@ void* sampleThread(void* arg) { } printf("\n"); - // Get lineage of actors waiting on disk. 
- auto diskAlps = IAsyncFileSystem::filesystem()->getActorLineageSet().copy(); - // printf("Disk ALPs: %d\n", diskAlps.size()); + for (const auto& [waitState, lineageFn] : samples) { + auto alps = lineageFn(); - // TODO: Get lineage of actors waiting on network - auto networkAlps = g_network->getActorLineageSet().copy(); - printf("Network ALPs: %d\n", networkAlps.size()); + // TODO: Serialize collected actor linage properties - // TODO: Call collect on all actor lineages - for (auto actorLineage : networkAlps) { - auto stack = actorLineage->stack(&StackLineage::actorName); - while (!stack.empty()) { - printf("%s ", stack.top()); - stack.pop(); + printf("Wait State #%d ALPs (%d):\n", waitState, alps.size()); + for (auto actorLineage : alps) { + auto stack = actorLineage->stack(&StackLineage::actorName); + while (!stack.empty()) { + printf("%s ", stack.top()); + stack.pop(); + } + printf("\n"); } - printf("\n"); } - - // TODO: Serialize collected actor linage properties } return nullptr; } void setupSamplingProfiler() { + samples[WaitState::Disk] = std::bind(&ActorLineageSet::copy, std::ref(g_network->getActorLineageSet())); + samples[WaitState::Network] = + std::bind(&ActorLineageSet::copy, std::ref(IAsyncFileSystem::filesystem()->getActorLineageSet())); + // TODO: Add knob TraceEvent("StartingSamplingProfilerThread"); startThread(&sampleThread, nullptr); From 60e59555a729a8227da903d65a6a264de4d97629 Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Wed, 7 Apr 2021 18:39:06 -0700 Subject: [PATCH 046/180] Removed btree cleanup parameter override. 
--- tests/rare/RedwoodCorrectnessBTree.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/rare/RedwoodCorrectnessBTree.toml b/tests/rare/RedwoodCorrectnessBTree.toml index db21848a4b..c39098e4cc 100644 --- a/tests/rare/RedwoodCorrectnessBTree.toml +++ b/tests/rare/RedwoodCorrectnessBTree.toml @@ -7,4 +7,3 @@ startDelay = 0 testName = 'UnitTests' maxTestCases = 0 testsMatching = '/redwood/correctness/btree' - remapCleanupWindow = 1000000000 From f8786da688737e42f1a482375c550258d03e0628 Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Wed, 7 Apr 2021 20:14:16 -0700 Subject: [PATCH 047/180] Added StorageByte::toString() and printed it in Redwood direct perf test. --- fdbclient/FDBTypes.h | 9 ++++++++- fdbserver/VersionedBTree.actor.cpp | 10 ++-------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/fdbclient/FDBTypes.h b/fdbclient/FDBTypes.h index b2cd469ab8..dde2a348ca 100644 --- a/fdbclient/FDBTypes.h +++ b/fdbclient/FDBTypes.h @@ -880,8 +880,15 @@ struct StorageBytes { void serialize(Ar& ar) { serializer(ar, free, total, used, available); } -}; + std::string toString() const { + return format("{%.2f MB total, %.2f MB free, %.2f MB available, %.2f MB used}", + total / 1e6, + free / 1e6, + available / 1e6, + used / 1e6); + } +}; struct LogMessageVersion { // Each message pushed into the log system has a unique, totally ordered LogMessageVersion // See ILogSystem::push() for how these are assigned diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index a7b999539f..89f6bae442 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -8187,6 +8187,7 @@ TEST_CASE(":/redwood/performance/set") { DWALPager* pager = new DWALPager(pageSize, fileName, pageCacheBytes, remapCleanupWindow); state VersionedBTree* btree = new VersionedBTree(pager, fileName); wait(btree->init()); + printf("Initialized. 
StorageBytes=%s\n", btree->getStorageBytes().toString().c_str()); state int64_t kvBytesThisCommit = 0; state int64_t kvBytesTotal = 0; @@ -8271,6 +8272,7 @@ TEST_CASE(":/redwood/performance/set") { printf("Cumulative %.2f MB keyValue bytes written at %.2f MB/s\n", kvBytesTotal / 1e6, kvBytesTotal / (timer() - start) / 1e6); + printf("StorageBytes=%s\n", btree->getStorageBytes().toString().c_str()); } printf("Warming cache with seeks\n"); @@ -8441,14 +8443,6 @@ struct KVSource { } }; -std::string toString(const StorageBytes& sb) { - return format("{%.2f MB total, %.2f MB free, %.2f MB available, %.2f MB used}", - sb.total / 1e6, - sb.free / 1e6, - sb.available / 1e6, - sb.used / 1e6); -} - ACTOR Future getStableStorageBytes(IKeyValueStore* kvs) { state StorageBytes sb = kvs->getStorageBytes(); From 5074ac6a4d9b5cfd1275193f2ad0746ddb0ca786 Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Wed, 7 Apr 2021 20:40:06 -0700 Subject: [PATCH 048/180] Missed file from previous merge commit. --- fdbserver/VersionedBTree.actor.cpp | 129 ++++++++++++++++------------- 1 file changed, 71 insertions(+), 58 deletions(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 89f6bae442..1c0c013892 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -7126,7 +7126,7 @@ TEST_CASE("/redwood/correctness/unit/RedwoodRecordRef") { bytes += deltaTest(a, b); } double elapsed = timer() - start; - printf("DeltaTest() on random large records %g M/s %g MB/s\n", count / elapsed / 1e6, bytes / elapsed / 1e6); + printf("DeltaTest() on random large records %f M/s %f MB/s\n", count / elapsed / 1e6, bytes / elapsed / 1e6); keyBuffer.resize(30); valueBuffer.resize(100); @@ -7138,7 +7138,7 @@ TEST_CASE("/redwood/correctness/unit/RedwoodRecordRef") { RedwoodRecordRef b = randomRedwoodRecordRef(keyBuffer, valueBuffer); bytes += deltaTest(a, b); } - printf("DeltaTest() on random small records %g M/s %g MB/s\n", count / elapsed / 
1e6, bytes / elapsed / 1e6); + printf("DeltaTest() on random small records %f M/s %f MB/s\n", count / elapsed / 1e6, bytes / elapsed / 1e6); RedwoodRecordRef rec1; RedwoodRecordRef rec2; @@ -7155,7 +7155,7 @@ TEST_CASE("/redwood/correctness/unit/RedwoodRecordRef") { for (i = 0; i < count; ++i) { total += rec1.getCommonPrefixLen(rec2, 50); } - printf("%" PRId64 " getCommonPrefixLen(skip=50) %g M/s\n", total, count / (timer() - start) / 1e6); + printf("%" PRId64 " getCommonPrefixLen(skip=50) %f M/s\n", total, count / (timer() - start) / 1e6); start = timer(); total = 0; @@ -7163,7 +7163,7 @@ TEST_CASE("/redwood/correctness/unit/RedwoodRecordRef") { for (i = 0; i < count; ++i) { total += rec1.getCommonPrefixLen(rec2, 0); } - printf("%" PRId64 " getCommonPrefixLen(skip=0) %g M/s\n", total, count / (timer() - start) / 1e6); + printf("%" PRId64 " getCommonPrefixLen(skip=0) %f M/s\n", total, count / (timer() - start) / 1e6); char buf[1000]; RedwoodRecordRef::Delta& d = *(RedwoodRecordRef::Delta*)buf; @@ -7176,7 +7176,7 @@ TEST_CASE("/redwood/correctness/unit/RedwoodRecordRef") { for (i = 0; i < count; ++i) { total += rec1.writeDelta(d, rec2, commonPrefix); } - printf("%" PRId64 " writeDelta(commonPrefix=%d) %g M/s\n", total, commonPrefix, count / (timer() - start) / 1e6); + printf("%" PRId64 " writeDelta(commonPrefix=%d) %f M/s\n", total, commonPrefix, count / (timer() - start) / 1e6); start = timer(); total = 0; @@ -7184,7 +7184,7 @@ TEST_CASE("/redwood/correctness/unit/RedwoodRecordRef") { for (i = 0; i < count; ++i) { total += rec1.writeDelta(d, rec2); } - printf("%" PRId64 " writeDelta() %g M/s\n", total, count / (timer() - start) / 1e6); + printf("%" PRId64 " writeDelta() %f M/s\n", total, count / (timer() - start) / 1e6); return Void(); } @@ -7744,30 +7744,43 @@ TEST_CASE("/redwood/correctness/btree") { g_redwoodMetricsActor = Void(); // Prevent trace event metrics from starting g_redwoodMetrics.clear(); - state std::string pagerFile = "unittest_pageFile.redwood"; + 
state std::string fileName = params.get("fileName").orDefault("unittest_pageFile.redwood"); IPager2* pager; - state bool serialTest = deterministicRandom()->coinflip(); - state bool shortTest = deterministicRandom()->coinflip(); + state bool serialTest = params.getInt("serialTest").orDefault(deterministicRandom()->coinflip()); + state bool shortTest = params.getInt("shortTest").orDefault(deterministicRandom()->coinflip()); state int pageSize = shortTest ? 200 : (deterministicRandom()->coinflip() ? 4096 : deterministicRandom()->randomInt(200, 400)); - state int64_t targetPageOps = shortTest ? 50000 : 1000000; - state bool pagerMemoryOnly = shortTest && (deterministicRandom()->random01() < .001); - state int maxKeySize = deterministicRandom()->randomInt(1, pageSize * 2); - state int maxValueSize = randomSize(pageSize * 25); - state int maxCommitSize = shortTest ? 1000 : randomSize(std::min((maxKeySize + maxValueSize) * 20000, 10e6)); - state double clearProbability = deterministicRandom()->random01() * .1; - state double clearSingleKeyProbability = deterministicRandom()->random01(); - state double clearPostSetProbability = deterministicRandom()->random01() * .1; - state double coldStartProbability = pagerMemoryOnly ? 0 : (deterministicRandom()->random01() * 0.3); - state double advanceOldVersionProbability = deterministicRandom()->random01(); + state int64_t targetPageOps = params.getInt("targetPageOps").orDefault(shortTest ? 50000 : 1000000); + state bool pagerMemoryOnly = + params.getInt("pagerMemoryOnly").orDefault(shortTest && (deterministicRandom()->random01() < .001)); + state int maxKeySize = params.getInt("maxKeySize").orDefault(deterministicRandom()->randomInt(1, pageSize * 2)); + state int maxValueSize = params.getInt("maxValueSize").orDefault(randomSize(pageSize * 25)); + state int maxCommitSize = + params.getInt("maxCommitSize") + .orDefault(shortTest ? 
1000 : randomSize(std::min((maxKeySize + maxValueSize) * 20000, 10e6))); + state double clearProbability = + params.getDouble("clearProbability").orDefault(deterministicRandom()->random01() * .1); + state double clearSingleKeyProbability = + params.getDouble("clearSingleKeyProbability").orDefault(deterministicRandom()->random01()); + state double clearPostSetProbability = + params.getDouble("clearPostSetProbability").orDefault(deterministicRandom()->random01() * .1); + state double coldStartProbability = params.getDouble("coldStartProbability") + .orDefault(pagerMemoryOnly ? 0 : (deterministicRandom()->random01() * 0.3)); + state double advanceOldVersionProbability = + params.getDouble("advanceOldVersionProbability").orDefault(deterministicRandom()->random01()); state int64_t cacheSizeBytes = - pagerMemoryOnly ? 2e9 : (pageSize * deterministicRandom()->randomInt(1, (BUGGIFY ? 2 : 10000) + 1)); - state Version versionIncrement = deterministicRandom()->randomInt64(1, 1e8); - state Version remapCleanupWindow = BUGGIFY ? 0 : deterministicRandom()->randomInt64(1, versionIncrement * 50); - state int maxVerificationMapEntries = 300e3; + params.getInt("cacheSizeBytes") + .orDefault(pagerMemoryOnly ? 2e9 + : (pageSize * deterministicRandom()->randomInt(1, (BUGGIFY ? 2 : 10000) + 1))); + state Version versionIncrement = + params.getInt("versionIncrement").orDefault(deterministicRandom()->randomInt64(1, 1e8)); + state Version remapCleanupWindow = + params.getInt("remapCleanupWindow") + .orDefault(BUGGIFY ? 
0 : deterministicRandom()->randomInt64(1, versionIncrement * 50)); + state int maxVerificationMapEntries = params.getInt("maxVerificationMapEntries").orDefault(300e3); printf("\n"); printf("targetPageOps: %" PRId64 "\n", targetPageOps); @@ -7790,11 +7803,11 @@ TEST_CASE("/redwood/correctness/btree") { printf("\n"); printf("Deleting existing test data...\n"); - deleteFile(pagerFile); + deleteFile(fileName); printf("Initializing...\n"); - pager = new DWALPager(pageSize, pagerFile, cacheSizeBytes, remapCleanupWindow, pagerMemoryOnly); - state VersionedBTree* btree = new VersionedBTree(pager, pagerFile); + pager = new DWALPager(pageSize, fileName, cacheSizeBytes, remapCleanupWindow, pagerMemoryOnly); + state VersionedBTree* btree = new VersionedBTree(pager, fileName); wait(btree->init()); state std::map, Optional> written; @@ -7997,8 +8010,8 @@ TEST_CASE("/redwood/correctness/btree") { wait(closedFuture); printf("Reopening btree from disk.\n"); - IPager2* pager = new DWALPager(pageSize, pagerFile, cacheSizeBytes, remapCleanupWindow); - btree = new VersionedBTree(pager, pagerFile); + IPager2* pager = new DWALPager(pageSize, fileName, cacheSizeBytes, remapCleanupWindow); + btree = new VersionedBTree(pager, fileName); wait(btree->init()); Version v = btree->getLatestVersion(); @@ -8034,7 +8047,7 @@ TEST_CASE("/redwood/correctness/btree") { state Future closedFuture = btree->onClosed(); btree->close(); wait(closedFuture); - btree = new VersionedBTree(new DWALPager(pageSize, pagerFile, cacheSizeBytes, 0), pagerFile); + btree = new VersionedBTree(new DWALPager(pageSize, fileName, cacheSizeBytes, 0), fileName); wait(btree->init()); wait(btree->clearAllAndCheckSanity()); @@ -8133,29 +8146,29 @@ TEST_CASE(":/redwood/performance/set") { g_redwoodMetricsActor = Void(); // Prevent trace event metrics from starting g_redwoodMetrics.clear(); - state std::string fileName = params.getParam("fileName").orDefault("unittest.redwood"); - state int pageSize = 
params.getIntParam("pageSize").orDefault(SERVER_KNOBS->REDWOOD_DEFAULT_PAGE_SIZE); - state int64_t pageCacheBytes = params.getIntParam("pageCacheBytes").orDefault(FLOW_KNOBS->PAGE_CACHE_4K); - state int nodeCount = params.getIntParam("nodeCount").orDefault(1e9); - state int maxRecordsPerCommit = params.getIntParam("maxRecordsPerCommit").orDefault(20000); - state int maxKVBytesPerCommit = params.getIntParam("maxKVBytesPerCommit").orDefault(20e6); - state int64_t kvBytesTarget = params.getIntParam("kvBytesTarget").orDefault(4e9); - state int minKeyPrefixBytes = params.getIntParam("minKeyPrefixBytes").orDefault(25); - state int maxKeyPrefixBytes = params.getIntParam("maxKeyPrefixBytes").orDefault(25); - state int minValueSize = params.getIntParam("minValueSize").orDefault(100); - state int maxValueSize = params.getIntParam("maxValueSize").orDefault(500); - state int minConsecutiveRun = params.getIntParam("minConsecutiveRun").orDefault(1); - state int maxConsecutiveRun = params.getIntParam("maxConsecutiveRun").orDefault(100); - state char firstKeyChar = params.getParam("firstKeyChar").orDefault("a")[0]; - state char lastKeyChar = params.getParam("lastKeyChar").orDefault("m")[0]; + state std::string fileName = params.get("fileName").orDefault("unittest.redwood"); + state int pageSize = params.getInt("pageSize").orDefault(SERVER_KNOBS->REDWOOD_DEFAULT_PAGE_SIZE); + state int64_t pageCacheBytes = params.getInt("pageCacheBytes").orDefault(FLOW_KNOBS->PAGE_CACHE_4K); + state int nodeCount = params.getInt("nodeCount").orDefault(1e9); + state int maxRecordsPerCommit = params.getInt("maxRecordsPerCommit").orDefault(20000); + state int maxKVBytesPerCommit = params.getInt("maxKVBytesPerCommit").orDefault(20e6); + state int64_t kvBytesTarget = params.getInt("kvBytesTarget").orDefault(4e9); + state int minKeyPrefixBytes = params.getInt("minKeyPrefixBytes").orDefault(25); + state int maxKeyPrefixBytes = params.getInt("maxKeyPrefixBytes").orDefault(25); + state int minValueSize = 
params.getInt("minValueSize").orDefault(100); + state int maxValueSize = params.getInt("maxValueSize").orDefault(500); + state int minConsecutiveRun = params.getInt("minConsecutiveRun").orDefault(1); + state int maxConsecutiveRun = params.getInt("maxConsecutiveRun").orDefault(100); + state char firstKeyChar = params.get("firstKeyChar").orDefault("a")[0]; + state char lastKeyChar = params.get("lastKeyChar").orDefault("m")[0]; state Version remapCleanupWindow = - params.getIntParam("remapCleanupWindow").orDefault(SERVER_KNOBS->REDWOOD_REMAP_CLEANUP_WINDOW); - state bool openExisting = params.getIntParam("openExisting").orDefault(0); - state bool insertRecords = !openExisting || params.getIntParam("insertRecords").orDefault(0); - state int concurrentSeeks = params.getIntParam("concurrentSeeks").orDefault(64); - state int concurrentScans = params.getIntParam("concurrentScans").orDefault(64); - state int seeks = params.getIntParam("seeks").orDefault(1000000); - state int scans = params.getIntParam("scans").orDefault(20000); + params.getInt("remapCleanupWindow").orDefault(SERVER_KNOBS->REDWOOD_REMAP_CLEANUP_WINDOW); + state bool openExisting = params.getInt("openExisting").orDefault(0); + state bool insertRecords = !openExisting || params.getInt("insertRecords").orDefault(0); + state int concurrentSeeks = params.getInt("concurrentSeeks").orDefault(64); + state int concurrentScans = params.getInt("concurrentScans").orDefault(64); + state int seeks = params.getInt("seeks").orDefault(1000000); + state int scans = params.getInt("scans").orDefault(20000); printf("pageSize: %d\n", pageSize); printf("pageCacheBytes: %" PRId64 "\n", pageCacheBytes); @@ -8648,10 +8661,10 @@ ACTOR Future doPrefixInsertComparison(int suffixSize, } TEST_CASE(":/redwood/performance/prefixSizeComparison") { - state int suffixSize = 12; - state int valueSize = 100; - state int recordCountTarget = 100e6; - state int usePrefixesInOrder = false; + state int suffixSize = 
params.getInt("suffixSize").orDefault(12); + state int valueSize = params.getInt("valueSize").orDefault(100); + state int recordCountTarget = params.getInt("recordCountTarget").orDefault(100e6); + state bool usePrefixesInOrder = params.getInt("usePrefixesInOrder").orDefault(0); wait(doPrefixInsertComparison( suffixSize, valueSize, recordCountTarget, usePrefixesInOrder, KVSource({ { 10, 100000 } }))); @@ -8669,9 +8682,9 @@ TEST_CASE(":/redwood/performance/prefixSizeComparison") { } TEST_CASE(":/redwood/performance/sequentialInsert") { - state int prefixLen = 30; - state int valueSize = 100; - state int recordCountTarget = 100e6; + state int prefixLen = params.getInt("prefixLen").orDefault(30); + state int valueSize = params.getInt("valueSize").orDefault(100); + state int recordCountTarget = params.getInt("recordCountTarget").orDefault(100e6); deleteFile("test.redwood"); wait(delay(5)); From 5e6655f11134f2880f55157f4a8d3e1515369398 Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Wed, 7 Apr 2021 23:56:20 -0700 Subject: [PATCH 049/180] Added temp space to StorageBytes. 
--- fdbclient/FDBTypes.h | 21 ++++++++++++++------- fdbserver/VersionedBTree.actor.cpp | 3 ++- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/fdbclient/FDBTypes.h b/fdbclient/FDBTypes.h index dde2a348ca..7334917639 100644 --- a/fdbclient/FDBTypes.h +++ b/fdbclient/FDBTypes.h @@ -866,15 +866,21 @@ struct TLogSpillType { // Contains the amount of free and total space for a storage server, in bytes struct StorageBytes { + // Free space on the filesystem int64_t free; + // Total space on the filesystem int64_t total; - int64_t used; // Used by *this* store, not total-free - int64_t available; // Amount of disk space that can be used by data structure, including free disk space and - // internally reusable space + // Used by *this* store, not total - free + int64_t used; + // Amount of space available for use by the store, which includes free space on the filesystem + // and internal free space within the store data that is immediately reusable. + int64_t available; + // Amount of space that could eventually be available for use after garbage collection + int64_t temp; StorageBytes() {} - StorageBytes(int64_t free, int64_t total, int64_t used, int64_t available) - : free(free), total(total), used(used), available(available) {} + StorageBytes(int64_t free, int64_t total, int64_t used, int64_t available, int64_t temp = 0) + : free(free), total(total), used(used), available(available), temp(temp) {} template void serialize(Ar& ar) { @@ -882,11 +888,12 @@ struct StorageBytes { } std::string toString() const { - return format("{%.2f MB total, %.2f MB free, %.2f MB available, %.2f MB used}", + return format("{%.2f MB total, %.2f MB free, %.2f MB available, %.2f MB used, %.2f MB temp}", total / 1e6, free / 1e6, available / 1e6, - used / 1e6); + used / 1e6, + temp / 1e6); } }; struct LogMessageVersion { diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 1c0c013892..8d659ff368 100644 --- a/fdbserver/VersionedBTree.actor.cpp 
+++ b/fdbserver/VersionedBTree.actor.cpp @@ -2111,8 +2111,9 @@ public: // known, if each commit delayed entries that were freeable were shuffled from the delayed free queue to the // free queue, but this doesn't seem necessary. int64_t reusable = (freeList.numEntries + delayedFreeList.numEntries) * physicalPageSize; + int64_t temp = remapQueue.numEntries * physicalPageSize; - return StorageBytes(free, total, pagerSize - reusable, free + reusable); + return StorageBytes(free, total, pagerSize - reusable, free + reusable, temp); } ACTOR static Future getUserPageCount_cleanup(DWALPager* self) { From cbd77fe6f3861ffdcd35e20dbfc838d15da0f3e7 Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Thu, 8 Apr 2021 01:09:47 -0700 Subject: [PATCH 050/180] Added new StorageBytes member to StorageMetrics and TLogMetrics (for newest TLog version only). Moved StorageBytes detail from SpecialCounters to the traceCounters() decorator callback to avoid calling getStorageBytes(), which makes a system call, four extra times on storage servers and eight extra times on logs. 
--- fdbserver/TLogServer.actor.cpp | 36 +++++++++++++++---------------- fdbserver/storageserver.actor.cpp | 14 +++++++----- 2 files changed, 26 insertions(+), 24 deletions(-) diff --git a/fdbserver/TLogServer.actor.cpp b/fdbserver/TLogServer.actor.cpp index 1561b2f81f..5c744f2e78 100644 --- a/fdbserver/TLogServer.actor.cpp +++ b/fdbserver/TLogServer.actor.cpp @@ -665,24 +665,6 @@ struct LogData : NonCopyable, public ReferenceCounted { specialCounter(cc, "SharedBytesDurable", [tLogData]() { return tLogData->bytesDurable; }); specialCounter(cc, "SharedOverheadBytesInput", [tLogData]() { return tLogData->overheadBytesInput; }); specialCounter(cc, "SharedOverheadBytesDurable", [tLogData]() { return tLogData->overheadBytesDurable; }); - specialCounter( - cc, "KvstoreBytesUsed", [tLogData]() { return tLogData->persistentData->getStorageBytes().used; }); - specialCounter( - cc, "KvstoreBytesFree", [tLogData]() { return tLogData->persistentData->getStorageBytes().free; }); - specialCounter(cc, "KvstoreBytesAvailable", [tLogData]() { - return tLogData->persistentData->getStorageBytes().available; - }); - specialCounter( - cc, "KvstoreBytesTotal", [tLogData]() { return tLogData->persistentData->getStorageBytes().total; }); - specialCounter( - cc, "QueueDiskBytesUsed", [tLogData]() { return tLogData->rawPersistentQueue->getStorageBytes().used; }); - specialCounter( - cc, "QueueDiskBytesFree", [tLogData]() { return tLogData->rawPersistentQueue->getStorageBytes().free; }); - specialCounter(cc, "QueueDiskBytesAvailable", [tLogData]() { - return tLogData->rawPersistentQueue->getStorageBytes().available; - }); - specialCounter( - cc, "QueueDiskBytesTotal", [tLogData]() { return tLogData->rawPersistentQueue->getStorageBytes().total; }); specialCounter(cc, "PeekMemoryReserved", [tLogData]() { return tLogData->peekMemoryLimiter.activePermits(); }); specialCounter(cc, "PeekMemoryRequestsStalled", [tLogData]() { return tLogData->peekMemoryLimiter.waiters(); }); specialCounter(cc, 
"Generation", [this]() { return this->recoveryCount; }); @@ -2672,7 +2654,23 @@ ACTOR Future tLogCore(TLogData* self, logData->logId, SERVER_KNOBS->STORAGE_LOGGING_DELAY, &logData->cc, - logData->logId.toString() + "/TLogMetrics")); + logData->logId.toString() + "/TLogMetrics", + [self=self](TraceEvent& te) { + StorageBytes sbTlog = self->persistentData->getStorageBytes(); + te.detail("KvstoreBytesUsed", sbTlog.used); + te.detail("KvstoreBytesFree", sbTlog.free); + te.detail("KvstoreBytesAvailable", sbTlog.available); + te.detail("KvstoreBytesTotal", sbTlog.total); + te.detail("KvstoreBytesTemp", sbTlog.temp); + + StorageBytes sbQueue = self->rawPersistentQueue->getStorageBytes(); + te.detail("QueueDiskBytesUsed", sbQueue.used); + te.detail("QueueDiskBytesFree", sbQueue.free); + te.detail("QueueDiskBytesAvailable", sbQueue.available); + te.detail("QueueDiskBytesTotal", sbQueue.total); + te.detail("QueueDiskBytesTemp", sbQueue.temp); + })); + logData->addActor.send(serveTLogInterface(self, tli, logData, warningCollectorInput)); logData->addActor.send(cleanupPeekTrackers(logData.getPtr())); logData->addActor.send(logPeekTrackers(logData.getPtr())); diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp index 8c26f955bb..5ded5d78d1 100644 --- a/fdbserver/storageserver.actor.cpp +++ b/fdbserver/storageserver.actor.cpp @@ -717,10 +717,6 @@ public: specialCounter(cc, "ActiveWatches", [self]() { return self->numWatches; }); specialCounter(cc, "WatchBytes", [self]() { return self->watchBytes; }); - specialCounter(cc, "KvstoreBytesUsed", [self]() { return self->storage.getStorageBytes().used; }); - specialCounter(cc, "KvstoreBytesFree", [self]() { return self->storage.getStorageBytes().free; }); - specialCounter(cc, "KvstoreBytesAvailable", [self]() { return self->storage.getStorageBytes().available; }); - specialCounter(cc, "KvstoreBytesTotal", [self]() { return self->storage.getStorageBytes().total; }); specialCounter(cc, "KvstoreSizeTotal", 
[self]() { return std::get<0>(self->storage.getSize()); }); specialCounter(cc, "KvstoreNodeTotal", [self]() { return std::get<1>(self->storage.getSize()); }); specialCounter(cc, "KvstoreInlineKey", [self]() { return std::get<2>(self->storage.getSize()); }); @@ -4240,7 +4236,15 @@ ACTOR Future metricsCore(StorageServer* self, StorageServerInterface ssi) SERVER_KNOBS->STORAGE_LOGGING_DELAY, &self->counters.cc, self->thisServerID.toString() + "/StorageMetrics", - [tag](TraceEvent& te) { te.detail("Tag", tag.toString()); })); + [tag, self=self](TraceEvent& te) { + te.detail("Tag", tag.toString()); + StorageBytes sb = self->storage.getStorageBytes(); + te.detail("KvstoreBytesUsed", sb.used); + te.detail("KvstoreBytesFree", sb.free); + te.detail("KvstoreBytesAvailable", sb.available); + te.detail("KvstoreBytesTotal", sb.total); + te.detail("KvstoreBytesTemp", sb.temp); + })); loop { choose { From 20649037057ee8deba2fca24cd2204d10a5e4b61 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Fri, 9 Apr 2021 14:25:11 -0600 Subject: [PATCH 051/180] collect and serialize --- CMakeLists.txt | 1 + cmake/GetMsgpack.cmake | 16 ++ fdbclient/ActorLineageProfiler.cpp | 183 ++++++++++++++++++++++ fdbclient/ActorLineageProfiler.h | 80 ++++++++++ fdbclient/CMakeLists.txt | 7 +- fdbserver/RoleLineage.actor.cpp | 4 +- fdbserver/RoleLineage.actor.h | 43 ++++-- fdbserver/WorkerInterface.actor.h | 34 +++++ flow/Net2.actor.cpp | 4 +- flow/Platform.actor.cpp | 8 +- flow/flow.cpp | 6 +- flow/flow.h | 13 +- flow/singleton.h | 237 +++++++++++++++++++++++++++++ 13 files changed, 606 insertions(+), 30 deletions(-) create mode 100644 cmake/GetMsgpack.cmake create mode 100644 fdbclient/ActorLineageProfiler.cpp create mode 100644 fdbclient/ActorLineageProfiler.h create mode 100644 flow/singleton.h diff --git a/CMakeLists.txt b/CMakeLists.txt index f6e85984f1..2e48d95447 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -152,6 +152,7 @@ if(CMAKE_SYSTEM_NAME STREQUAL "FreeBSD") endif() 
include(CompileBoost) +include(GetMsgpack) add_subdirectory(flow) add_subdirectory(fdbrpc) add_subdirectory(fdbclient) diff --git a/cmake/GetMsgpack.cmake b/cmake/GetMsgpack.cmake new file mode 100644 index 0000000000..0b951d5a1b --- /dev/null +++ b/cmake/GetMsgpack.cmake @@ -0,0 +1,16 @@ +find_package(msgpack 3.3.0 EXACT QUIET CONFIG) + +add_library(msgpack INTERFACE) + +if(msgpack_FOUND) + target_link_libraries(msgpack INTERFACE msgpackc-cxx) +else() + include(ExternalProject) + ExternalProject_add(msgpackProject + URL "https://github.com/msgpack/msgpack-c/releases/download/cpp-3.3.0/msgpack-3.3.0.tar.gz" + URL_HASH SHA256=6e114d12a5ddb8cb11f669f83f32246e484a8addd0ce93f274996f1941c1f07b + CONFIGURE_COMMAND BUILD_COMMAND INSTALL_COMMAND) + + ExternalProject_Get_property(msgpackProject SOURCE_DIR) + target_include_directories(msgpack SYSTEM INTERFACE "${SOURCE_DIR}/include") +endif() \ No newline at end of file diff --git a/fdbclient/ActorLineageProfiler.cpp b/fdbclient/ActorLineageProfiler.cpp new file mode 100644 index 0000000000..8d5ad1d6ae --- /dev/null +++ b/fdbclient/ActorLineageProfiler.cpp @@ -0,0 +1,183 @@ +/* + * ActorLineageProfiler.h + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-20201 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "flow/singleton.h" +#include "fdbclient/ActorLineageProfiler.h" +#include +#include +#include + +using namespace std::literals; + +class Packer : public msgpack::packer { + struct visitor_t { + using VisitorMap = std::unordered_map>; + VisitorMap visitorMap; + + template + static void any_visitor(std::any const& val, Packer& packer) { + const T& v = std::any_cast(val); + packer.pack(v); + } + + template + struct populate_visitor_map; + template + struct populate_visitor_map { + static void populate(VisitorMap& map) { + map.emplace(any_visitor); + populate_visitor_map::populate(map); + } + }; + template <> + struct populate_visitor_map<> { + static void populate(VisitorMap&) {} + }; + + visitor_t() { populate_visitor_map::populate(visitorMap); } + + void visit(const std::any& val, Packer& packer) { + auto iter = visitorMap.find(val.type()); + if (iter == visitorMap.end()) { + // TODO: trace error + } else { + iter->second(val, packer); + } + } + }; + msgpack::sbuffer sbuffer; + // Initializing visitor_t involves building a type-map. As this is a relatively expensive operation, we don't want + // to do this each time we create a Packer object. So visitor_t is a stateless class and we only use it as a + // visitor. 
+ crossbow::singleton visitor; + +public: + Packer() : msgpack::packer(sbuffer) {} + + void pack(std::any const& val) { visitor->visit(val, *this); } + + void pack(bool val) { + if (val) { + pack_true(); + } else { + pack_false(); + } + } + + void pack(uint64_t val) { + if (val <= std::numeric_limits::max()) { + pack_uint8(uint8_t(val)); + } else if (val <= std::numeric_limits::max()) { + pack_uint16(uint16_t(val)); + } else if (val <= std::numeric_limits::max()) { + pack_uint32(uint32_t(val)); + } else { + pack_uint64(val); + } + } + + void pack(int64_t val) { + if (val >= 0) { + this->pack(uint64_t(val)); + } else if (val >= std::numeric_limits::min()) { + pack_int8(int8_t(val)); + } else if (val >= std::numeric_limits::min()) { + pack_int8(int16_t(val)); + } else if (val >= std::numeric_limits::min()) { + pack_int8(int32_t(val)); + } else if (val >= std::numeric_limits::min()) { + pack_int8(int64_t(val)); + } + } + + void pack(float val) { pack_float(val); } + void pack(double val) { pack_double(val); } + void pack(std::string const& str) { + pack_str(str.size()); + pack_str_body(str.data(), str.size()); + } + + void pack(std::string_view val) { + pack_str(val.size()); + pack_str_body(val.data(), val.size()); + } + + template + void pack(std::map const& map) { + pack_map(map.size()); + for (const auto& p : map) { + pack(p.first); + pack(p.second); + } + } + + template + void pack(std::vector const& val) { + pack_array(val.size()); + for (const auto& v : val) { + pack(v); + } + } + + std::shared_ptr done(double time) { + auto res = std::make_shared(); + res->time = time; + res->size = sbuffer.size(); + res->data = sbuffer.release(); + return res; + } +}; + +IALPCollectorBase::IALPCollectorBase() { + SampleCollector::instance().addCollector(this); +} + +std::map SampleCollectorT::collect(ActorLineage* lineage) { + std::map out; + for (auto& collector : collectors) { + auto val = collector->collect(lineage); + if (val.has_value()) { + out[collector->name()] = 
val.value(); + } + } + return out; +} + +std::shared_ptr SampleCollectorT::collect() { + Packer packer; + std::map res; + double time = g_network->now(); + res["time"sv] = time; + for (auto& p : getSamples) { + std::vector> samples; + auto sampleVec = p.second(); + for (auto& val : sampleVec) { + auto m = collect(val.getPtr()); + if (!m.empty()) { + samples.emplace_back(std::move(m)); + } + } + if (!samples.empty()) { + res[to_string(p.first)] = samples; + } + } + packer.pack(res); + return packer.done(time); +} diff --git a/fdbclient/ActorLineageProfiler.h b/fdbclient/ActorLineageProfiler.h new file mode 100644 index 0000000000..cbd2e7d1f3 --- /dev/null +++ b/fdbclient/ActorLineageProfiler.h @@ -0,0 +1,80 @@ +/* + * ActorLineageProfiler.h + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-20201 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once +#include +#include +#include +#include +#include "flow/singleton.h" +#include "flow/flow.h" + +struct IALPCollectorBase { + virtual std::optional collect(ActorLineage*) = 0; + virtual const std::string_view& name() = 0; + IALPCollectorBase(); +}; + +template +struct IALPCollector : IALPCollectorBase { + const std::string_view& name() override { + static std::string_view res; + if (res == "") { + res = T::name; + } + return res; + } +}; + +enum class WaitState { Running, DiskIO }; + +std::string_view to_string(WaitState w) { + switch (w) { + case WaitState::Running: + return "Running"; + case WaitState::DiskIO: + return "DiskIO"; + } +} + +struct Sample : std::enable_shared_from_this { + double time = 0.0; + unsigned size = 0u; + char* data = nullptr; + ~Sample() { ::free(data); } +}; + +class SampleCollectorT { +public: // Types + friend class crossbow::singleton; + using Getter = std::function>()>; + +private: + std::vector collectors; + std::map getSamples; + SampleCollectorT() {} + +public: + void addCollector(IALPCollectorBase* collector) { collectors.push_back(collector); } + std::map collect(ActorLineage* lineage); + std::shared_ptr collect(); +}; + +using SampleCollector = crossbow::singleton; diff --git a/fdbclient/CMakeLists.txt b/fdbclient/CMakeLists.txt index 129f9e7d3e..f81fd92eac 100644 --- a/fdbclient/CMakeLists.txt +++ b/fdbclient/CMakeLists.txt @@ -1,4 +1,6 @@ set(FDBCLIENT_SRCS + ActorLineageProfiler.h + ActorLineageProfiler.cpp AsyncFileS3BlobStore.actor.cpp AsyncFileS3BlobStore.actor.h AsyncTaskThread.actor.cpp @@ -137,8 +139,7 @@ endif() add_flow_target(STATIC_LIBRARY NAME fdbclient SRCS ${FDBCLIENT_SRCS} ADDL_SRCS ${options_srcs}) add_dependencies(fdbclient fdboptions) +target_link_libraries(fdbclient PUBLIC fdbrpc msgpack) if(BUILD_AZURE_BACKUP) - target_link_libraries(fdbclient PUBLIC fdbrpc PRIVATE curl uuid azure-storage-lite) -else() - target_link_libraries(fdbclient PUBLIC fdbrpc) + target_link_libraries(fdbclient 
PRIVATE curl uuid azure-storage-lite) endif() diff --git a/fdbserver/RoleLineage.actor.cpp b/fdbserver/RoleLineage.actor.cpp index 6d1b49527a..b54282f5f0 100644 --- a/fdbserver/RoleLineage.actor.cpp +++ b/fdbserver/RoleLineage.actor.cpp @@ -20,4 +20,6 @@ #include "fdbserver/RoleLineage.actor.h" -StringRef RoleLineage::name = "RoleLineage"_sr; +using namespace std::literals; + +std::string_view RoleLineage::name = "RoleLineage"sv; diff --git a/fdbserver/RoleLineage.actor.h b/fdbserver/RoleLineage.actor.h index d35c749771..5cbf65ed53 100644 --- a/fdbserver/RoleLineage.actor.h +++ b/fdbserver/RoleLineage.actor.h @@ -21,30 +21,47 @@ #pragma once #include "flow/flow.h" #if defined(NO_INTELLISENSE) && !defined(FDBSERVER_ROLE_LINEAGE_ACTOR_G_H) -# define FDBSERVER_ROLE_LINEAGE_ACTOR_G_H -# include "fdbserver/RoleLineage.actor.g.h" +#define FDBSERVER_ROLE_LINEAGE_ACTOR_G_H +#include "fdbserver/RoleLineage.actor.g.h" #elif !defined(FDBSERVER_ROLE_LINEAGE_ACTOR_H) -# define FDBSERVER_ROLE_LINEAGE_ACTOR_H +#define FDBSERVER_ROLE_LINEAGE_ACTOR_H +#include "flow/singleton.h" #include "fdbrpc/Locality.h" +#include "fdbclient/ActorLineageProfiler.h" +#include "fdbserver/WorkerInterface.actor.h" + +#include +#include +#include #include "flow/actorcompiler.h" // This must be the last include struct RoleLineage : LineageProperties { - static StringRef name; - ProcessClass::ClusterRole role = ProcessClass::NoRole; + static std::string_view name; + ProcessClass::ClusterRole role = ProcessClass::NoRole; - bool isSet(ProcessClass::ClusterRole RoleLineage::*member) const { - return this->*member != ProcessClass::NoRole; - } + bool isSet(ProcessClass::ClusterRole RoleLineage::*member) const { return this->*member != ProcessClass::NoRole; } +}; + +struct RoleLineageCollector : IALPCollector { + RoleLineageCollector() : IALPCollector() {} + std::optional collect(ActorLineage* lineage) override { + auto res = lineage->get(&RoleLineage::role); + if (res.has_value()) { + return 
Role::get(res.value()).abbreviation; + } else { + return std::optional(); + } + } }; // creates a new root and sets the role lineage -ACTOR template +ACTOR template Future()())> runInRole(Fun fun, ProcessClass::ClusterRole role) { - currentLineage->makeRoot(); - currentLineage->modify(&RoleLineage::role) = role; - decltype(std::declval()()) res = wait(fun()); - return res; + currentLineage->makeRoot(); + currentLineage->modify(&RoleLineage::role) = role; + decltype(std::declval()()) res = wait(fun()); + return res; } #endif diff --git a/fdbserver/WorkerInterface.actor.h b/fdbserver/WorkerInterface.actor.h index f1d83ec819..57c2833f3c 100644 --- a/fdbserver/WorkerInterface.actor.h +++ b/fdbserver/WorkerInterface.actor.h @@ -787,6 +787,40 @@ struct Role { std::string abbreviation; bool includeInTraceRoles; + static const Role& get(ProcessClass::ClusterRole role) { + switch (role) { + case ProcessClass::Storage: + return STORAGE_SERVER; + case ProcessClass::TLog: + return TRANSACTION_LOG; + case ProcessClass::CommitProxy: + return COMMIT_PROXY; + case ProcessClass::GrvProxy: + return GRV_PROXY; + case ProcessClass::Master: + return MASTER; + case ProcessClass::Resolver: + return RESOLVER; + case ProcessClass::LogRouter: + return LOG_ROUTER; + case ProcessClass::ClusterController: + return CLUSTER_CONTROLLER; + case ProcessClass::DataDistributor: + return DATA_DISTRIBUTOR; + case ProcessClass::Ratekeeper: + return RATEKEEPER; + case ProcessClass::StorageCache: + return STORAGE_CACHE; + case ProcessClass::Backup: + return BACKUP; + case ProcessClass::Worker: + return WORKER; + case ProcessClass::NoRole: + ASSERT(false); + throw internal_error(); + } + } + bool operator==(const Role& r) const { return roleName == r.roleName; } bool operator!=(const Role& r) const { return !(*this == r); } diff --git a/flow/Net2.actor.cpp b/flow/Net2.actor.cpp index bb0b0325c6..a95af0cd21 100644 --- a/flow/Net2.actor.cpp +++ b/flow/Net2.actor.cpp @@ -226,7 +226,9 @@ public: TaskPriority 
currentTaskID; uint64_t tasksIssued; TDMetricCollection tdmetrics; - double currentTime; + // we read now() from a different thread. On Intel, reading a double is atomic anyways, but on other platforms it's + // not. For portability this should be atomic + std::atomic currentTime; // May be accessed off the network thread, e.g. by onMainThread std::atomic stopped; mutable std::map addressOnHostCache; diff --git a/flow/Platform.actor.cpp b/flow/Platform.actor.cpp index 50f252021b..b28c6c35d5 100644 --- a/flow/Platform.actor.cpp +++ b/flow/Platform.actor.cpp @@ -3685,8 +3685,8 @@ void* sampleThread(void* arg) { printf("Currently running actor lineage (%p):\n", actorLineage.getPtr()); auto stack = actorLineage->stack(&StackLineage::actorName); while (!stack.empty()) { - printf("%s ", stack.top()); - stack.pop(); + printf("%s ", stack.back()); + stack.pop_back(); } printf("\n"); @@ -3697,8 +3697,8 @@ void* sampleThread(void* arg) { for (auto actorLineage : diskAlps) { auto stack = actorLineage->stack(&StackLineage::actorName); while (!stack.empty()) { - printf("%s ", stack.top()); - stack.pop(); + printf("%s ", stack.back()); + stack.pop_back(); } printf("\n"); } diff --git a/flow/flow.cpp b/flow/flow.cpp index 9a7dda781a..351c8d0aa2 100644 --- a/flow/flow.cpp +++ b/flow/flow.cpp @@ -39,9 +39,11 @@ ActorLineage::~ActorLineage() { } } -StringRef StackLineage::name = "StackLineage"_sr; +using namespace std::literals; -std::stack getActorStackTrace() { +std::string_view StackLineage::name = "StackLineage"sv; + +std::vector getActorStackTrace() { return currentLineage->stack(&StackLineage::actorName); } diff --git a/flow/flow.h b/flow/flow.h index b61453c8f2..09211959a7 100644 --- a/flow/flow.h +++ b/flow/flow.h @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include @@ -450,7 +451,7 @@ struct ActorLineage : ReferenceCounted { friend class LocalLineage; private: - std::unordered_map properties; + std::unordered_map properties; Reference parent; 
public: @@ -483,15 +484,15 @@ public: return std::optional{}; } template - std::stack stack(V T::*member) const { + std::vector stack(V T::*member) const { auto current = this; - std::stack res; + std::vector res; while (current != nullptr) { auto iter = current->properties.find(T::name); if (iter != current->properties.end()) { T const& map = static_cast(*iter->second); if (map.isSet(member)) { - res.push(map.*member); + res.push_back(map.*member); } } current = current->parent.getPtr(); @@ -529,11 +530,11 @@ struct restore_lineage { }; struct StackLineage : LineageProperties { - static StringRef name; + static const std::string_view name; StringRef actorName; }; -extern std::stack getActorStackTrace(); +extern std::vector getActorStackTrace(); // SAV is short for Single Assignment Variable: It can be assigned for only once! template diff --git a/flow/singleton.h b/flow/singleton.h new file mode 100644 index 0000000000..c6a256ac42 --- /dev/null +++ b/flow/singleton.h @@ -0,0 +1,237 @@ +/* + * (C) Copyright 2015 ETH Zurich Systems Group (http://www.systems.ethz.ch/) and others. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Contributors: + * Markus Pilman + * Simon Loesing + * Thomas Etter + * Kevin Bocksrocker + * Lucas Braun + */ +#pragma once + +#include +#include +#include +#include + +namespace crossbow { + +/** + * @brief A mock mutex for disabling locking in the singleton + * + * This class implements the mutex concept with empty methods. 
+ * This can be used to disable synchronization in the singleton + * holder. + */ +struct no_locking { + void lock() {} + void unlock() {} + bool try_lock() { return true; } +}; + +template +struct create_static { + static constexpr bool supports_recreation = false; + union max_align { + char t_[sizeof(T)]; + short int short_int_; + long int long_int_; + float float_; + double double_; + long double longDouble_; + struct Test; + int Test::*pMember_; + int (Test::*pMemberFn_)(int); + }; + + static T* create() { + static max_align static_memory_; + return new (&static_memory_) T; + } + + static void destroy(T* ptr) { ptr->~T(); } +}; + +template +struct create_using_new { + static constexpr bool supports_recreation = true; + static T* create() { return new T; }; + + static void destroy(T* ptr) { delete ptr; } +}; + +template +struct create_using_malloc { + static constexpr bool supports_recreation = true; + static T* create() { + void* p = std::malloc(sizeof(T)); + if (!p) + return nullptr; + return new (p) T; + } + + static void destroy(T* ptr) { + ptr->~T(); + free(ptr); + } +}; + +template +struct create_using { + static constexpr bool supports_recreation = true; + static allocator alloc_; + + static T* create() { + T* p = alloc_.allocate(1); + if (!p) + return nullptr; + alloc_.construct(p); + return p; + }; + + static void destroy(T* ptr) { + alloc_.destroy(ptr); + alloc_.deallocate(ptr, 1); + } +}; + +template +struct default_lifetime { + static void schedule_destruction(T*, void (*func)()) { std::atexit(func); } + + static void on_dead_ref() { throw std::logic_error("Dead reference detected"); } +}; + +template +struct phoenix_lifetime { + static void schedule_destruction(T*, void (*func)()) { std::atexit(func); } + + static void on_dead_ref() {} +}; + +template +struct infinite_lifetime { + static void schedule_destruction(T*, void (*)()) {} + static void on_dead_ref() {} +}; + +template +struct lifetime_traits { + static constexpr bool supports_recreation = 
true; +}; + +template +struct lifetime_traits> { + static constexpr bool supports_recreation = false; +}; + +template +struct lifetime_traits> { + static constexpr bool supports_recreation = false; +}; + +template , + typename LifetimePolicy = default_lifetime, + typename Mutex = std::mutex> +class singleton { +public: + typedef Type value_type; + typedef Type* pointer; + typedef const Type* const_pointer; + typedef const Type& const_reference; + typedef Type& reference; + +private: + static bool destroyed_; + static pointer instance_; + static Mutex mutex_; + + static void destroy() { + if (destroyed_) + return; + Create::destroy(instance_); + instance_ = nullptr; + destroyed_ = true; + } + +public: + static reference instance() { + static_assert(Create::supports_recreation || !lifetime_traits::supports_recreation, + "The creation policy does not support instance recreation, while the lifetime does support it."); + if (!instance_) { + std::lock_guard l(mutex_); + if (!instance_) { + if (destroyed_) { + destroyed_ = false; + LifetimePolicy::on_dead_ref(); + } + instance_ = Create::create(); + LifetimePolicy::schedule_destruction(instance_, &destroy); + } + } + return *instance_; + } + /** + * WARNING: DO NOT EXECUTE THIS MULTITHREADED!!! 
+ */ + static void destroy_instance() { + if (instance_) { + std::lock_guard l(mutex_); + destroy(); + } + } + +public: + pointer operator->() { + if (!instance_) { + instance(); + } + return instance_; + } + + reference operator*() { + if (!instance_) { + instance(); + } + return *instance_; + } + + const_pointer operator->() const { + if (!instance_) { + instance(); + } + return instance_; + } + + const_reference operator*() const { + if (!instance_) { + instance(); + } + return *instance_; + } +}; + +template +bool singleton::destroyed_ = false; + +template +typename singleton::pointer singleton::instance_ = nullptr; + +template +M singleton::mutex_; + +} // namespace crossbow \ No newline at end of file From 20d98421af0e8d11ca41f3e748d52f54d9a143e7 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Fri, 9 Apr 2021 15:16:07 -0600 Subject: [PATCH 052/180] fix compiler errors --- fdbclient/ActorLineageProfiler.cpp | 13 +++++++++++-- fdbclient/ActorLineageProfiler.h | 11 ++--------- fdbserver/SigStack.cpp | 20 ++++++++++---------- flow/flow.cpp | 2 +- 4 files changed, 24 insertions(+), 22 deletions(-) diff --git a/fdbclient/ActorLineageProfiler.cpp b/fdbclient/ActorLineageProfiler.cpp index 8d5ad1d6ae..a28f011d5a 100644 --- a/fdbclient/ActorLineageProfiler.cpp +++ b/fdbclient/ActorLineageProfiler.cpp @@ -26,9 +26,18 @@ using namespace std::literals; +std::string_view to_string(WaitState w) { + switch (w) { + case WaitState::Running: + return "Running"; + case WaitState::DiskIO: + return "DiskIO"; + } +} + class Packer : public msgpack::packer { struct visitor_t { - using VisitorMap = std::unordered_map>; + using VisitorMap = std::unordered_map>; VisitorMap visitorMap; template @@ -42,7 +51,7 @@ class Packer : public msgpack::packer { template struct populate_visitor_map { static void populate(VisitorMap& map) { - map.emplace(any_visitor); + map.emplace(std::type_index(typeid(Head)), any_visitor); populate_visitor_map::populate(map); } }; diff --git 
a/fdbclient/ActorLineageProfiler.h b/fdbclient/ActorLineageProfiler.h index cbd2e7d1f3..af32d6de13 100644 --- a/fdbclient/ActorLineageProfiler.h +++ b/fdbclient/ActorLineageProfiler.h @@ -45,14 +45,7 @@ struct IALPCollector : IALPCollectorBase { enum class WaitState { Running, DiskIO }; -std::string_view to_string(WaitState w) { - switch (w) { - case WaitState::Running: - return "Running"; - case WaitState::DiskIO: - return "DiskIO"; - } -} +std::string_view to_string(WaitState w); struct Sample : std::enable_shared_from_this { double time = 0.0; @@ -63,7 +56,7 @@ struct Sample : std::enable_shared_from_this { class SampleCollectorT { public: // Types - friend class crossbow::singleton; + friend struct crossbow::create_static; using Getter = std::function>()>; private: diff --git a/fdbserver/SigStack.cpp b/fdbserver/SigStack.cpp index efec5aff7d..0c35326766 100644 --- a/fdbserver/SigStack.cpp +++ b/fdbserver/SigStack.cpp @@ -7,17 +7,17 @@ // However, this should be good enough for an initial // proof of concept. 
extern "C" void stackSignalHandler(int sig) { - auto stack = getActorStackTrace(); - int i = 0; - while (!stack.empty()) { - auto s = stack.top(); - stack.pop(); - std::string_view n(reinterpret_cast(s.begin()), s.size()); - std::cout << i << ": " << n << std::endl; - ++i; - } + auto stack = getActorStackTrace(); + int i = 0; + while (!stack.empty()) { + auto s = stack.back(); + stack.pop_back(); + std::string_view n(reinterpret_cast(s.begin()), s.size()); + std::cout << i << ": " << n << std::endl; + ++i; + } } void setupStackSignal() { - std::signal(SIGUSR1, &stackSignalHandler); + std::signal(SIGUSR1, &stackSignalHandler); } diff --git a/flow/flow.cpp b/flow/flow.cpp index 351c8d0aa2..1332207e38 100644 --- a/flow/flow.cpp +++ b/flow/flow.cpp @@ -41,7 +41,7 @@ ActorLineage::~ActorLineage() { using namespace std::literals; -std::string_view StackLineage::name = "StackLineage"sv; +const std::string_view StackLineage::name = "StackLineage"sv; std::vector getActorStackTrace() { return currentLineage->stack(&StackLineage::actorName); From 8a6473c08a83bfe1bb888dbfd2dd7d3813c70295 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Fri, 9 Apr 2021 15:23:42 -0600 Subject: [PATCH 053/180] Apply suggestions from code review Co-authored-by: Lukas Joswiak --- fdbclient/ActorLineageProfiler.cpp | 16 ++++++++-------- fdbclient/ActorLineageProfiler.h | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/fdbclient/ActorLineageProfiler.cpp b/fdbclient/ActorLineageProfiler.cpp index a28f011d5a..13bc224001 100644 --- a/fdbclient/ActorLineageProfiler.cpp +++ b/fdbclient/ActorLineageProfiler.cpp @@ -1,9 +1,9 @@ /* - * ActorLineageProfiler.h + * ActorLineageProfiler.cpp * * This source file is part of the FoundationDB open source project * - * Copyright 2013-20201 Apple Inc. and the FoundationDB project authors + * Copyright 2013-2021 Apple Inc. 
and the FoundationDB project authors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -107,12 +107,12 @@ public: this->pack(uint64_t(val)); } else if (val >= std::numeric_limits::min()) { pack_int8(int8_t(val)); - } else if (val >= std::numeric_limits::min()) { - pack_int8(int16_t(val)); - } else if (val >= std::numeric_limits::min()) { - pack_int8(int32_t(val)); - } else if (val >= std::numeric_limits::min()) { - pack_int8(int64_t(val)); + } else if (val >= std::numeric_limits::min()) { + pack_int16(int16_t(val)); + } else if (val >= std::numeric_limits::min()) { + pack_int32(int32_t(val)); + } else if (val >= std::numeric_limits::min()) { + pack_int64(int64_t(val)); } } diff --git a/fdbclient/ActorLineageProfiler.h b/fdbclient/ActorLineageProfiler.h index af32d6de13..2b4e780f39 100644 --- a/fdbclient/ActorLineageProfiler.h +++ b/fdbclient/ActorLineageProfiler.h @@ -3,7 +3,7 @@ * * This source file is part of the FoundationDB open source project * - * Copyright 2013-20201 Apple Inc. and the FoundationDB project authors + * Copyright 2013-2021 Apple Inc. and the FoundationDB project authors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
From 6656557b6a276d191e743dfa9414ffbad0afadae Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Fri, 9 Apr 2021 15:25:11 -0600 Subject: [PATCH 054/180] made internal collect method private --- fdbclient/ActorLineageProfiler.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbclient/ActorLineageProfiler.h b/fdbclient/ActorLineageProfiler.h index 2b4e780f39..1f2bdad659 100644 --- a/fdbclient/ActorLineageProfiler.h +++ b/fdbclient/ActorLineageProfiler.h @@ -63,10 +63,10 @@ private: std::vector collectors; std::map getSamples; SampleCollectorT() {} + std::map collect(ActorLineage* lineage); public: void addCollector(IALPCollectorBase* collector) { collectors.push_back(collector); } - std::map collect(ActorLineage* lineage); std::shared_ptr collect(); }; From 34f903447a7bf4dea455eeba4a7aff8c5595d17e Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Sat, 10 Apr 2021 22:43:37 -0700 Subject: [PATCH 055/180] Seek test output improvement. --- fdbserver/VersionedBTree.actor.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 8d659ff368..28c73480b6 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -7650,12 +7650,14 @@ TEST_CASE("/redwood/correctness/unit/deltaTree/IntIntPair") { pos = newPos; } double elapsed = timer() - start; - printf("Seek/skip test, jumpMax=%d, items=%d, oldSeek=%d useHint=%d: Elapsed %f s\n", + printf("Seek/skip test, count=%d jumpMax=%d, items=%d, oldSeek=%d useHint=%d: Elapsed %f seconds %.2f M/s\n", + count, jumpMax, items.size(), old, useHint, - elapsed); + elapsed, + double(count) / elapsed / 1e6); }; // Compare seeking to nearby elements with and without hints, using the old and new SeekLessThanOrEqual methods. 
From 8e7b35d708e796cc2cc50267e2559a4e4d4812d2 Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Sat, 10 Apr 2021 22:44:28 -0700 Subject: [PATCH 056/180] Removed otherAncestor from DeltaTree::DecodedNode, replaced uses with path retracing. --- fdbserver/DeltaTree.h | 44 ++++++++++++++++++++++++++++++++----------- 1 file changed, 33 insertions(+), 11 deletions(-) diff --git a/fdbserver/DeltaTree.h b/fdbserver/DeltaTree.h index ceff1f2ec3..2865af596c 100644 --- a/fdbserver/DeltaTree.h +++ b/fdbserver/DeltaTree.h @@ -235,19 +235,16 @@ public: // construct root node DecodedNode(Node* raw, const T* prev, const T* next, Arena& arena, bool large) - : raw(raw), parent(nullptr), otherAncestor(nullptr), leftChild(nullptr), rightChild(nullptr), prev(prev), - next(next), item(raw->delta(large).apply(raw->delta(large).getPrefixSource() ? *prev : *next, arena)), - large(large) { + : raw(raw), parent(nullptr), leftChild(nullptr), rightChild(nullptr), prev(prev), next(next), + item(raw->delta(large).apply(raw->delta(large).getPrefixSource() ? *prev : *next, arena)), large(large) { // printf("DecodedNode1 raw=%p delta=%s\n", raw, raw->delta(large).toString().c_str()); } // Construct non-root node // wentLeft indicates that we've gone left to get to the raw node. DecodedNode(Node* raw, DecodedNode* parent, bool wentLeft, Arena& arena) - : parent(parent), large(parent->large), - otherAncestor(wentLeft ? parent->getPrevAncestor() : parent->getNextAncestor()), - prev(wentLeft ? parent->prev : &parent->item), next(wentLeft ? &parent->item : parent->next), - leftChild(nullptr), rightChild(nullptr), raw(raw), + : parent(parent), large(parent->large), prev(wentLeft ? parent->prev : &parent->item), + next(wentLeft ? &parent->item : parent->next), leftChild(nullptr), rightChild(nullptr), raw(raw), item(raw->delta(large).apply(raw->delta(large).getPrefixSource() ? 
*prev : *next, arena)) { // printf("DecodedNode2 raw=%p delta=%s\n", raw, raw->delta(large).toString().c_str()); } @@ -258,12 +255,34 @@ public: // Returns true if otherAncestor is the next ("least greator") ancestor bool otherAncestorNext() const { return parent && parent->rightChild == this; } - DecodedNode* getPrevAncestor() const { return otherAncestorPrev() ? otherAncestor : parent; } + // Gets the first ancestor to the left + DecodedNode* getPrevAncestor() const { + DecodedNode* p = parent; + const DecodedNode* child = this; + // While p is not null and p is not to the left of child (meaning child is p's right child) + while (p != nullptr && p->rightChild != child) { + // Otherwise, move up + child = p; + p = p->parent; + } + return p; + } - DecodedNode* getNextAncestor() const { return otherAncestorNext() ? otherAncestor : parent; } + DecodedNode* getNextAncestor() const { + DecodedNode* p = parent; + const DecodedNode* child = this; + // While p is not null and p is not to the right of child (meaning child is p's left child) + while (p != nullptr && p->leftChild != child) { + // Otherwise, move up + child = p; + p = p->parent; + } + return p; + } DecodedNode* jumpUpNext(DecodedNode* root, bool& othersChild) const { if (parent != nullptr) { + DecodedNode* otherAncestor = otherAncestorPrev() ? getPrevAncestor() : getNextAncestor(); if (parent->rightChild == this) { return otherAncestor; } @@ -277,6 +296,7 @@ public: DecodedNode* jumpUpPrev(DecodedNode* root, bool& othersChild) const { if (parent != nullptr) { + DecodedNode* otherAncestor = otherAncestorPrev() ? getPrevAncestor() : getNextAncestor(); if (parent->leftChild == this) { return otherAncestor; } @@ -290,22 +310,26 @@ public: DecodedNode* jumpNext(DecodedNode* root) const { if (otherAncestorNext()) { + DecodedNode* otherAncestor = getNextAncestor(); return (otherAncestor != nullptr) ? 
otherAncestor : rightChild; } else { if (this == root) { return rightChild; } + DecodedNode* otherAncestor = getPrevAncestor(); return (otherAncestor != nullptr) ? otherAncestor->rightChild : root; } } DecodedNode* jumpPrev(DecodedNode* root) const { if (otherAncestorPrev()) { + DecodedNode* otherAncestor = getPrevAncestor(); return (otherAncestor != nullptr) ? otherAncestor : leftChild; } else { if (this == root) { return leftChild; } + DecodedNode* otherAncestor = getNextAncestor(); return (otherAncestor != nullptr) ? otherAncestor->leftChild : root; } } @@ -317,7 +341,6 @@ public: bool large; // Node size Node* raw; DecodedNode* parent; - DecodedNode* otherAncestor; DecodedNode* leftChild; DecodedNode* rightChild; const T* prev; // greatest ancestor to the left, or tree lower bound @@ -474,7 +497,6 @@ public: newNode->leftChild = nullptr; newNode->rightChild = nullptr; newNode->raw = raw; - newNode->otherAncestor = addLeftChild ? n->getPrevAncestor() : n->getNextAncestor(); newNode->prev = prev; newNode->next = next; From 4e24e3e8c8a3f132b6a8c4ad2fce4b5f76812410 Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Sat, 10 Apr 2021 23:00:03 -0700 Subject: [PATCH 057/180] Reverted removal of otherAncestor from DeltaTree::DecodedNode due to too high of a performance hit. --- fdbserver/DeltaTree.h | 44 +++++++++++-------------------------------- 1 file changed, 11 insertions(+), 33 deletions(-) diff --git a/fdbserver/DeltaTree.h b/fdbserver/DeltaTree.h index 2865af596c..ceff1f2ec3 100644 --- a/fdbserver/DeltaTree.h +++ b/fdbserver/DeltaTree.h @@ -235,16 +235,19 @@ public: // construct root node DecodedNode(Node* raw, const T* prev, const T* next, Arena& arena, bool large) - : raw(raw), parent(nullptr), leftChild(nullptr), rightChild(nullptr), prev(prev), next(next), - item(raw->delta(large).apply(raw->delta(large).getPrefixSource() ? 
*prev : *next, arena)), large(large) { + : raw(raw), parent(nullptr), otherAncestor(nullptr), leftChild(nullptr), rightChild(nullptr), prev(prev), + next(next), item(raw->delta(large).apply(raw->delta(large).getPrefixSource() ? *prev : *next, arena)), + large(large) { // printf("DecodedNode1 raw=%p delta=%s\n", raw, raw->delta(large).toString().c_str()); } // Construct non-root node // wentLeft indicates that we've gone left to get to the raw node. DecodedNode(Node* raw, DecodedNode* parent, bool wentLeft, Arena& arena) - : parent(parent), large(parent->large), prev(wentLeft ? parent->prev : &parent->item), - next(wentLeft ? &parent->item : parent->next), leftChild(nullptr), rightChild(nullptr), raw(raw), + : parent(parent), large(parent->large), + otherAncestor(wentLeft ? parent->getPrevAncestor() : parent->getNextAncestor()), + prev(wentLeft ? parent->prev : &parent->item), next(wentLeft ? &parent->item : parent->next), + leftChild(nullptr), rightChild(nullptr), raw(raw), item(raw->delta(large).apply(raw->delta(large).getPrefixSource() ? *prev : *next, arena)) { // printf("DecodedNode2 raw=%p delta=%s\n", raw, raw->delta(large).toString().c_str()); } @@ -255,34 +258,12 @@ public: // Returns true if otherAncestor is the next ("least greator") ancestor bool otherAncestorNext() const { return parent && parent->rightChild == this; } - // Gets the first ancestor to the left - DecodedNode* getPrevAncestor() const { - DecodedNode* p = parent; - const DecodedNode* child = this; - // While p is not null and p is not to the left of child (meaning child is p's right child) - while (p != nullptr && p->rightChild != child) { - // Otherwise, move up - child = p; - p = p->parent; - } - return p; - } + DecodedNode* getPrevAncestor() const { return otherAncestorPrev() ? 
otherAncestor : parent; } - DecodedNode* getNextAncestor() const { - DecodedNode* p = parent; - const DecodedNode* child = this; - // While p is not null and p is not to the right of child (meaning child is p's left child) - while (p != nullptr && p->leftChild != child) { - // Otherwise, move up - child = p; - p = p->parent; - } - return p; - } + DecodedNode* getNextAncestor() const { return otherAncestorNext() ? otherAncestor : parent; } DecodedNode* jumpUpNext(DecodedNode* root, bool& othersChild) const { if (parent != nullptr) { - DecodedNode* otherAncestor = otherAncestorPrev() ? getPrevAncestor() : getNextAncestor(); if (parent->rightChild == this) { return otherAncestor; } @@ -296,7 +277,6 @@ public: DecodedNode* jumpUpPrev(DecodedNode* root, bool& othersChild) const { if (parent != nullptr) { - DecodedNode* otherAncestor = otherAncestorPrev() ? getPrevAncestor() : getNextAncestor(); if (parent->leftChild == this) { return otherAncestor; } @@ -310,26 +290,22 @@ public: DecodedNode* jumpNext(DecodedNode* root) const { if (otherAncestorNext()) { - DecodedNode* otherAncestor = getNextAncestor(); return (otherAncestor != nullptr) ? otherAncestor : rightChild; } else { if (this == root) { return rightChild; } - DecodedNode* otherAncestor = getPrevAncestor(); return (otherAncestor != nullptr) ? otherAncestor->rightChild : root; } } DecodedNode* jumpPrev(DecodedNode* root) const { if (otherAncestorPrev()) { - DecodedNode* otherAncestor = getPrevAncestor(); return (otherAncestor != nullptr) ? otherAncestor : leftChild; } else { if (this == root) { return leftChild; } - DecodedNode* otherAncestor = getNextAncestor(); return (otherAncestor != nullptr) ? 
otherAncestor->leftChild : root; } } @@ -341,6 +317,7 @@ public: bool large; // Node size Node* raw; DecodedNode* parent; + DecodedNode* otherAncestor; DecodedNode* leftChild; DecodedNode* rightChild; const T* prev; // greatest ancestor to the left, or tree lower bound @@ -497,6 +474,7 @@ public: newNode->leftChild = nullptr; newNode->rightChild = nullptr; newNode->raw = raw; + newNode->otherAncestor = addLeftChild ? n->getPrevAncestor() : n->getNextAncestor(); newNode->prev = prev; newNode->next = next; From 13e00e8408bc0914751574dc688c4ebeea4b2b11 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Mon, 12 Apr 2021 09:43:45 -0600 Subject: [PATCH 058/180] made ActorLineage thread safe --- flow/flow.h | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/flow/flow.h b/flow/flow.h index 09211959a7..2fab7b11a4 100644 --- a/flow/flow.h +++ b/flow/flow.h @@ -42,6 +42,7 @@ #include #include #include +#include #include "flow/Platform.h" #include "flow/FastAlloc.h" @@ -453,14 +454,23 @@ struct ActorLineage : ReferenceCounted { private: std::unordered_map properties; Reference parent; + mutable std::mutex mutex; + using Lock = std::unique_lock; public: ActorLineage(); ~ActorLineage(); - bool isRoot() const { return parent.getPtr() == nullptr; } - void makeRoot() { parent.clear(); } + bool isRoot() const { + Lock _{ mutex }; + return parent.getPtr() == nullptr; + } + void makeRoot() { + Lock _{ mutex }; + parent.clear(); + } template V& modify(V T::*member) { + Lock _{ mutex }; auto& res = properties[T::name]; if (!res) { res = new T{}; @@ -470,6 +480,7 @@ public: } template std::optional get(V T::*member) const { + Lock _{ mutex }; auto current = this; while (current != nullptr) { auto iter = current->properties.find(T::name); @@ -485,6 +496,7 @@ public: } template std::vector stack(V T::*member) const { + Lock _{ mutex }; auto current = this; std::vector res; while (current != nullptr) { From eb2fe0dbcf19e656fe1f7500fd85164778d1c868 Mon Sep 17 
00:00:00 2001 From: Markus Pilman Date: Mon, 12 Apr 2021 09:48:53 -0600 Subject: [PATCH 059/180] added serializable containers --- fdbclient/ActorLineageProfiler.cpp | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/fdbclient/ActorLineageProfiler.cpp b/fdbclient/ActorLineageProfiler.cpp index 13bc224001..a084beb4b3 100644 --- a/fdbclient/ActorLineageProfiler.cpp +++ b/fdbclient/ActorLineageProfiler.cpp @@ -60,7 +60,19 @@ class Packer : public msgpack::packer { static void populate(VisitorMap&) {} }; - visitor_t() { populate_visitor_map::populate(visitorMap); } + visitor_t() { + populate_visitor_map, + std::map, + std::map, + std::unordered_map>::populate(visitorMap); + } void visit(const std::any& val, Packer& packer) { auto iter = visitorMap.find(val.type()); From ec95b649b04179254ba5c8b6ef8de676972090dc Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Mon, 12 Apr 2021 09:51:59 -0600 Subject: [PATCH 060/180] Any can't be used as an index type --- fdbclient/ActorLineageProfiler.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fdbclient/ActorLineageProfiler.cpp b/fdbclient/ActorLineageProfiler.cpp index a084beb4b3..5c0aaf86d1 100644 --- a/fdbclient/ActorLineageProfiler.cpp +++ b/fdbclient/ActorLineageProfiler.cpp @@ -69,9 +69,8 @@ class Packer : public msgpack::packer { std::string, std::string_view, std::vector, - std::map, - std::map, - std::unordered_map>::populate(visitorMap); + std::map, + std::map>::populate(visitorMap); } void visit(const std::any& val, Packer& packer) { From c8b8e8cf7d94e0d421d4a8163ba851fd0560a57e Mon Sep 17 00:00:00 2001 From: Lukas Joswiak Date: Wed, 14 Apr 2021 11:27:01 -0700 Subject: [PATCH 061/180] Fix msgpack install --- cmake/GetMsgpack.cmake | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/cmake/GetMsgpack.cmake b/cmake/GetMsgpack.cmake index 0b951d5a1b..dc9a578175 100644 --- a/cmake/GetMsgpack.cmake +++ b/cmake/GetMsgpack.cmake @@ -9,8 +9,11 @@ 
else() ExternalProject_add(msgpackProject URL "https://github.com/msgpack/msgpack-c/releases/download/cpp-3.3.0/msgpack-3.3.0.tar.gz" URL_HASH SHA256=6e114d12a5ddb8cb11f669f83f32246e484a8addd0ce93f274996f1941c1f07b - CONFIGURE_COMMAND BUILD_COMMAND INSTALL_COMMAND) + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + ) ExternalProject_Get_property(msgpackProject SOURCE_DIR) target_include_directories(msgpack SYSTEM INTERFACE "${SOURCE_DIR}/include") -endif() \ No newline at end of file +endif() From daa1796c99072eac14bba04f73ea8a6f7ada9d09 Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Thu, 15 Apr 2021 00:08:29 -0700 Subject: [PATCH 062/180] Added Pager function for trying to evict a page from cache. --- fdbserver/IPager.h | 1 + fdbserver/VersionedBTree.actor.cpp | 56 +++++++++++++++++++++++++----- 2 files changed, 49 insertions(+), 8 deletions(-) diff --git a/fdbserver/IPager.h b/fdbserver/IPager.h index 45c9f02fcc..7f21e30566 100644 --- a/fdbserver/IPager.h +++ b/fdbserver/IPager.h @@ -66,6 +66,7 @@ public: class IPagerSnapshot { public: virtual Future> getPhysicalPage(LogicalPageID pageID, bool cacheable, bool nohit) = 0; + virtual bool tryEvictPage(LogicalPageID id) = 0; virtual Version getVersion() const = 0; virtual Key getMetaKey() const = 0; diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 28c73480b6..52408c2820 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -1030,6 +1030,22 @@ public: return nullptr; } + // Try to evict the item at index from cache + // Returns true if item is evicted or was not present in cache + bool tryEvict(const IndexType& index) { + auto i = cache.find(index); + if (i == cache.end() || !i->second.item.evictable()) { + return false; + } + Entry& toEvict = i->second; + if (toEvict.hits == 0) { + ++g_redwoodMetrics.pagerEvictUnhit; + } + evictionOrder.erase(evictionOrder.iterator_to(toEvict)); + cache.erase(toEvict.index); + return 
true; + } + // Get the object for i or create a new one. // After a get(), the object for i is the last in evictionOrder. // If noHit is set, do not consider this access to be cache hit if the object is present @@ -1690,6 +1706,11 @@ public: return readPhysicalPage(self, pageID, true); } + bool tryEvictPage(LogicalPageID logicalID, Version v) { + PhysicalPageID physicalID = getPhysicalPageID(logicalID, v); + return pageCache.tryEvict(physicalID); + } + // Reads the most recent version of pageID, either previously committed or written using updatePage() in the current // commit Future> readPage(LogicalPageID pageID, bool cacheable, bool noHit = false) override { @@ -1725,14 +1746,14 @@ public: return cacheEntry.readFuture; } - Future> readPageAtVersion(LogicalPageID pageID, Version v, bool cacheable, bool noHit) { + PhysicalPageID getPhysicalPageID(LogicalPageID pageID, Version v) { auto i = remappedPages.find(pageID); if (i != remappedPages.end()) { auto j = i->second.upper_bound(v); if (j != i->second.begin()) { --j; - debug_printf("DWALPager(%s) op=readAtVersionRemapped %s @%" PRId64 " -> %s\n", + debug_printf("DWALPager(%s) op=lookupRemapped %s @%" PRId64 " -> %s\n", filename.c_str(), toString(pageID).c_str(), v, @@ -1741,13 +1762,18 @@ public: ASSERT(pageID != invalidLogicalPageID); } } else { - debug_printf("DWALPager(%s) op=readAtVersionNotRemapped %s @%" PRId64 " (not remapped)\n", + debug_printf("DWALPager(%s) op=lookupNotRemapped %s @%" PRId64 " (not remapped)\n", filename.c_str(), toString(pageID).c_str(), v); } - return readPage(pageID, cacheable, noHit); + return (PhysicalPageID)pageID; + } + + Future> readPageAtVersion(LogicalPageID logicalID, Version v, bool cacheable, bool noHit) { + PhysicalPageID physicalID = getPhysicalPageID(logicalID, v); + return readPage(physicalID, cacheable, noHit); } // Get snapshot as of the most recent committed version of the pager @@ -2281,9 +2307,11 @@ public: throw expired.getError(); } return 
map(pager->readPageAtVersion(pageID, version, cacheable, noHit), - [=](Reference p) { return Reference(p); }); + [=](Reference p) { return Reference(std::move(p)); }); } + bool tryEvictPage(LogicalPageID id) override { return pager->tryEvictPage(id, version); } + Key getMetaKey() const override { return metaKey; } Version getVersion() const override { return version; } @@ -4153,6 +4181,17 @@ private: int m_size; }; + // Try to evict a BTree page from the pager cache. + // Returns true if, at the end of the call, the page is no longer in cache, + // so the caller can assume its IPage reference is the only one. + bool tryEvictPage(IPagerSnapshot* pager, BTreePageIDRef id) { + // If it's an oversized page, currently it cannot be in the cache + if (id.size() > 0) { + return true; + } + return pager->tryEvictPage(id.front()); + } + ACTOR static Future> readPage(Reference snapshot, BTreePageIDRef id, const RedwoodRecordRef* lowerBound, @@ -4175,7 +4214,7 @@ private: if (id.size() == 1) { Reference p = wait(snapshot->getPhysicalPage(id.front(), !forLazyClear, false)); - page = p; + page = std::move(p); } else { ASSERT(!id.empty()); std::vector>> reads; @@ -4208,7 +4247,7 @@ private: pTreePage->toString(false, id, snapshot->getVersion(), lowerBound, upperBound).c_str()); } - return page; + return std::move(page); } static void preLoadPage(IPagerSnapshot* snapshot, BTreePageIDRef id) { @@ -5077,6 +5116,7 @@ private: state bool detachChildren = (parentInfo->count > 2); state bool forceUpdate = false; + // If no changes were made, but we should rewrite it to point directly to remapped child pages if (!m.changesMade && detachChildren) { debug_printf( "%s Internal page forced rewrite because at least %d children have been updated in-place.\n", @@ -5107,7 +5147,7 @@ private: if (m.updating) { // Page was updated in place (or being forced to be updated in place to update child page ids) debug_printf( - "%s Internal page modified in-place tryUpdate=%d forceUpdate=%d 
detachChildren=%d\n", + "%s Internal page modified in-place tryToUpdate=%d forceUpdate=%d detachChildren=%d\n", context.c_str(), tryToUpdate, forceUpdate, From b2d6930103becc1323397bab0fec40fa0ce64e0a Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Thu, 15 Apr 2021 11:45:14 -0700 Subject: [PATCH 063/180] The multi-version client monitors the cluster's protocol version and only activates the client library that can connect. --- bindings/c/fdb_c.cpp | 2 +- fdbclient/CoordinationInterface.h | 5 + fdbclient/DatabaseContext.h | 11 + fdbclient/IClientApi.h | 2 +- fdbclient/MonitorLeader.actor.cpp | 7 +- fdbclient/MonitorLeader.h | 1 + fdbclient/MultiVersionTransaction.actor.cpp | 313 ++++++++------------ fdbclient/MultiVersionTransaction.h | 71 ++--- fdbclient/NativeAPI.actor.cpp | 139 ++++++--- fdbclient/NativeAPI.actor.h | 9 +- fdbclient/ThreadSafeTransaction.cpp | 7 +- fdbclient/ThreadSafeTransaction.h | 2 +- fdbrpc/FlowTransport.actor.cpp | 26 +- fdbrpc/FlowTransport.h | 49 +-- flow/ProtocolVersion.h | 12 + 15 files changed, 350 insertions(+), 306 deletions(-) diff --git a/bindings/c/fdb_c.cpp b/bindings/c/fdb_c.cpp index 907f8058b6..2c133dae36 100644 --- a/bindings/c/fdb_c.cpp +++ b/bindings/c/fdb_c.cpp @@ -364,7 +364,7 @@ extern "C" DLLEXPORT double fdb_database_get_main_thread_busyness(FDBDatabase* d return DB(d)->getMainThreadBusyness(); } -// Returns the protocol version reported by a quorum of coordinators +// Returns the protocol version reported by the coordinator this client is connected to // If an expected version is non-zero, the future won't return until the protocol version is different than expected extern "C" DLLEXPORT FDBFuture* fdb_database_get_server_protocol(FDBDatabase* db, uint64_t expected_version) { Optional expected; diff --git a/fdbclient/CoordinationInterface.h b/fdbclient/CoordinationInterface.h index 0d22b035fb..d826da4fd6 100644 --- a/fdbclient/CoordinationInterface.h +++ b/fdbclient/CoordinationInterface.h @@ -35,6 +35,7 @@ constexpr 
UID WLTOKEN_CLIENTLEADERREG_OPENDATABASE(-1, 3); constexpr UID WLTOKEN_PROTOCOL_INFO(-1, 10); +// The coordinator interface as exposed to clients struct ClientLeaderRegInterface { RequestStream getLeader; RequestStream openDatabase; @@ -42,6 +43,10 @@ struct ClientLeaderRegInterface { ClientLeaderRegInterface() {} ClientLeaderRegInterface(NetworkAddress remote); ClientLeaderRegInterface(INetwork* local); + + bool operator==(const ClientLeaderRegInterface& rhs) const { + return getLeader == rhs.getLeader && openDatabase == rhs.openDatabase; + } }; class ClusterConnectionString { diff --git a/fdbclient/DatabaseContext.h b/fdbclient/DatabaseContext.h index 2e1100fef7..487ce50bf2 100644 --- a/fdbclient/DatabaseContext.h +++ b/fdbclient/DatabaseContext.h @@ -152,6 +152,7 @@ public: return (DatabaseContext*)DatabaseContext::operator new(sizeof(DatabaseContext)); } + // Static constructor used by server processes to create a DatabaseContext // For internal (fdbserver) use only static Database create(Reference> clientInfo, Future clientInfoMonitor, @@ -164,9 +165,11 @@ public: ~DatabaseContext(); + // Constructs a new copy of this DatabaseContext from the parameters of this DatabaseContext Database clone() const { return Database(new DatabaseContext(connectionFile, clientInfo, + coordinator, clientInfoMonitor, taskID, clientLocality, @@ -196,6 +199,10 @@ public: Future onProxiesChanged(); Future getHealthMetrics(bool detailed); + // Returns the protocol version reported by the coordinator this client is connected to + // If an expected version is given, the future won't return until the protocol version is different than expected + Future getClusterProtocol(Optional expectedVersion = Optional()); + // Update the watch counter for the database void addWatch(); void removeWatch(); @@ -247,6 +254,7 @@ public: // private: explicit DatabaseContext(Reference>> connectionFile, Reference> clientDBInfo, + Reference>> coordinator, Future clientInfoMonitor, TaskPriority taskID, 
LocalityData const& clientLocality, @@ -380,6 +388,9 @@ public: Future clientInfoMonitor; Future connected; + // An AsyncVar that reports the coordinator this DatabaseContext is interacting with + Reference>> coordinator; + Reference>> statusClusterInterface; Future statusLeaderMon; double lastStatusFetch; diff --git a/fdbclient/IClientApi.h b/fdbclient/IClientApi.h index 4496eff732..a3de56bf10 100644 --- a/fdbclient/IClientApi.h +++ b/fdbclient/IClientApi.h @@ -100,7 +100,7 @@ public: virtual void setOption(FDBDatabaseOptions::Option option, Optional value = Optional()) = 0; virtual double getMainThreadBusyness() = 0; - // Returns the protocol version reported by a quorum of coordinators + // Returns the protocol version reported by the coordinator this client is connected to // If an expected version is given, the future won't return until the protocol version is different than expected virtual ThreadFuture getServerProtocol( Optional expectedVersion = Optional()) = 0; diff --git a/fdbclient/MonitorLeader.actor.cpp b/fdbclient/MonitorLeader.actor.cpp index af563c68b0..df14e6a40a 100644 --- a/fdbclient/MonitorLeader.actor.cpp +++ b/fdbclient/MonitorLeader.actor.cpp @@ -757,6 +757,7 @@ void shrinkProxyList(ClientDBInfo& ni, ACTOR Future monitorProxiesOneGeneration( Reference connFile, Reference> clientInfo, + Reference>> coordinator, MonitorLeaderInfo info, Reference>>> supportedVersions, Key traceLogGroup) { @@ -774,6 +775,9 @@ ACTOR Future monitorProxiesOneGeneration( loop { state ClientLeaderRegInterface clientLeaderServer(addrs[idx]); state OpenDatabaseCoordRequest req; + + coordinator->set(clientLeaderServer); + req.clusterKey = cs.clusterKey(); req.coordinators = cs.coordinators(); req.knownClientInfoID = clientInfo->get().id; @@ -840,13 +844,14 @@ ACTOR Future monitorProxiesOneGeneration( ACTOR Future monitorProxies( Reference>> connFile, Reference> clientInfo, + Reference>> coordinator, Reference>>> supportedVersions, Key traceLogGroup) { state 
MonitorLeaderInfo info(connFile->get()); loop { choose { when(MonitorLeaderInfo _info = wait(monitorProxiesOneGeneration( - connFile->get(), clientInfo, info, supportedVersions, traceLogGroup))) { + connFile->get(), clientInfo, coordinator, info, supportedVersions, traceLogGroup))) { info = _info; } when(wait(connFile->onChange())) { diff --git a/fdbclient/MonitorLeader.h b/fdbclient/MonitorLeader.h index 204b6994f4..b9b195a9da 100644 --- a/fdbclient/MonitorLeader.h +++ b/fdbclient/MonitorLeader.h @@ -76,6 +76,7 @@ Future monitorLeaderForProxies(Value const& key, Future monitorProxies( Reference>> const& connFile, Reference> const& clientInfo, + Reference>> const& coordinator, Reference>>> const& supportedVersions, Key const& traceLogGroup); diff --git a/fdbclient/MultiVersionTransaction.actor.cpp b/fdbclient/MultiVersionTransaction.actor.cpp index 4b6ba0c27c..57f23e3d88 100644 --- a/fdbclient/MultiVersionTransaction.actor.cpp +++ b/fdbclient/MultiVersionTransaction.actor.cpp @@ -356,7 +356,7 @@ double DLDatabase::getMainThreadBusyness() { return 0; } -// Returns the protocol version reported by a quorum of coordinators +// Returns the protocol version reported by the coordinator this client is connected to // If an expected version is given, the future won't return until the protocol version is different than expected ThreadFuture DLDatabase::getServerProtocol(Optional expectedVersion) { ASSERT(api->databaseGetServerProtocol != nullptr); @@ -877,35 +877,35 @@ MultiVersionDatabase::MultiVersionDatabase(MultiVersionApi* api, int threadIdx, std::string clusterFilePath, Reference db, + Reference versionMonitorDb, bool openConnectors) - : dbState(new DatabaseState()), clusterFilePath(clusterFilePath) { + : dbState(new DatabaseState(clusterFilePath, versionMonitorDb)) { dbState->db = db; dbState->dbVar->set(db); - if (!openConnectors) { - dbState->currentClientIndex = 0; - } else { + if (openConnectors) { if (!api->localClientDisabled) { - dbState->currentClientIndex = 
0; - dbState->addConnection(api->getLocalClient(), clusterFilePath); - } else { - dbState->currentClientIndex = -1; + dbState->addClient(api->getLocalClient()); } - api->runOnExternalClients(threadIdx, [this, clusterFilePath](Reference client) { - dbState->addConnection(client, clusterFilePath); - }); + if (!externalClientsInitialized.test_and_set()) { + api->runOnExternalClientsAllThreads([&clusterFilePath](Reference client) { + // This creates a database to initialize some client state on the external library, + // but it gets deleted immediately so that we don't keep open connections + Reference newDb = client->api->createDatabase(clusterFilePath.c_str()); + }); + } - dbState->startConnections(); + api->runOnExternalClients(threadIdx, [this](Reference client) { dbState->addClient(client); }); + + dbState->protocolVersionMonitor = dbState->monitorProtocolVersion(); } } -MultiVersionDatabase::~MultiVersionDatabase() { - dbState->cancelConnections(); -} - +// Create a MultiVersionDatabase that wraps an already created IDatabase object +// For internal use in testing Reference MultiVersionDatabase::debugCreateFromExistingDatabase(Reference db) { - return Reference(new MultiVersionDatabase(MultiVersionApi::api, 0, "", db, false)); + return Reference(new MultiVersionDatabase(MultiVersionApi::api, 0, "", db, db, false)); } Reference MultiVersionDatabase::createTransaction() { @@ -963,189 +963,122 @@ double MultiVersionDatabase::getMainThreadBusyness() { return 0; } -// Returns the protocol version reported by a quorum of coordinators +// Returns the protocol version reported by the coordinator this client is connected to // If an expected version is given, the future won't return until the protocol version is different than expected ThreadFuture MultiVersionDatabase::getServerProtocol(Optional expectedVersion) { - // TODO: send this out through the active database - return MultiVersionApi::api->getLocalClient() - ->api->createDatabase(clusterFilePath.c_str()) - 
->getServerProtocol(expectedVersion); + return dbState->versionMonitorDb->getServerProtocol(expectedVersion); } -void MultiVersionDatabase::Connector::connect() { - addref(); - onMainThreadVoid( - [this]() { - if (!cancelled) { - connected = false; - if (connectionFuture.isValid()) { - connectionFuture.cancel(); - } +MultiVersionDatabase::DatabaseState::DatabaseState(std::string clusterFilePath, Reference versionMonitorDb) + : clusterFilePath(clusterFilePath), versionMonitorDb(versionMonitorDb), + dbVar(new ThreadSafeAsyncVar>(Reference(NULL))) {} - candidateDatabase = client->api->createDatabase(clusterFilePath.c_str()); - if (client->external) { - connectionFuture = candidateDatabase.castTo()->onReady(); - } else { - connectionFuture = ThreadFuture(Void()); - } +// Adds a client (local or externally loaded) that can be used to connect to the cluster +void MultiVersionDatabase::DatabaseState::addClient(Reference client) { + ProtocolVersion baseVersion = client->protocolVersion.normalizedVersion(); + auto [itr, inserted] = clients.insert({ baseVersion, client }); + if (!inserted) { + // SOMEDAY: prefer client with higher release version if protocol versions are compatible + Reference keptClient = itr->second; + Reference discardedClient = client; + if (client->canReplace(itr->second)) { + std::swap(keptClient, discardedClient); + clients[baseVersion] = client; + } - connectionFuture = flatMapThreadFuture(connectionFuture, [this](ErrorOr ready) { - if (ready.isError()) { - return ErrorOr>(ready.getError()); - } + discardedClient->failed = true; + TraceEvent(SevWarn, "DuplicateClientVersion") + .detail("Keeping", keptClient->libPath) + .detail("KeptProtocolVersion", keptClient->protocolVersion) + .detail("Disabling", discardedClient->libPath) + .detail("DisabledProtocolVersion", discardedClient->protocolVersion); - tr = candidateDatabase->createTransaction(); - return ErrorOr>( - mapThreadFuture(tr->getReadVersion(), [](ErrorOr v) { - // If the version attempt 
returns an error, we regard that as a connection (except - // operation_cancelled) - if (v.isError() && v.getError().code() == error_code_operation_cancelled) { - return ErrorOr(v.getError()); - } else { - return ErrorOr(Void()); - } - })); - }); - - int userParam; - connectionFuture.callOrSetAsCallback(this, userParam, 0); - } else { - delref(); - } - }, - nullptr); -} - -// Only called from main thread -void MultiVersionDatabase::Connector::cancel() { - connected = false; - cancelled = true; - if (connectionFuture.isValid()) { - connectionFuture.cancel(); - } -} - -void MultiVersionDatabase::Connector::fire(const Void& unused, int& userParam) { - onMainThreadVoid( - [this]() { - if (!cancelled) { - connected = true; - dbState->stateChanged(); - } - delref(); - }, - nullptr); -} - -void MultiVersionDatabase::Connector::error(const Error& e, int& userParam) { - if (e.code() != error_code_operation_cancelled) { - // TODO: is it right to abandon this connection attempt? - client->failed = true; MultiVersionApi::api->updateSupportedVersions(); - TraceEvent(SevError, "DatabaseConnectionError").error(e).detail("ClientLibrary", this->client->libPath); } - - delref(); } -MultiVersionDatabase::DatabaseState::DatabaseState() - : dbVar(new ThreadSafeAsyncVar>(Reference(nullptr))), currentClientIndex(-1) {} +// Watch the cluster protocol version for changes and update the database state when it does +ThreadFuture MultiVersionDatabase::DatabaseState::monitorProtocolVersion() { + ThreadFuture f = versionMonitorDb->getServerProtocol(dbProtocolVersion); + return mapThreadFuture(f, [this](ErrorOr cv) { + if (cv.isError()) { + TraceEvent("ErrorGettingClusterProtocolVersion") + .detail("ExpectedProtocolVersion", dbProtocolVersion) + .error(cv.getError()); + } -// Only called from main thread -void MultiVersionDatabase::DatabaseState::stateChanged() { - int newIndex = -1; - for (int i = 0; i < clients.size(); ++i) { - if (i != currentClientIndex && connectionAttempts[i]->connected) { 
- if (currentClientIndex >= 0 && !clients[i]->canReplace(clients[currentClientIndex])) { - TraceEvent(SevWarn, "DuplicateClientVersion") - .detail("Keeping", clients[currentClientIndex]->libPath) - .detail("KeptClientProtocolVersion", clients[currentClientIndex]->protocolVersion.version()) - .detail("Disabling", clients[i]->libPath) - .detail("DisabledClientProtocolVersion", clients[i]->protocolVersion.version()); - connectionAttempts[i]->connected = false; // Permanently disable this client in favor of the current one - clients[i]->failed = true; - MultiVersionApi::api->updateSupportedVersions(); - return; + ProtocolVersion clusterVersion = !cv.isError() ? cv.get() : dbProtocolVersion.orDefault(currentProtocolVersion); + onMainThreadVoid([this, clusterVersion]() { protocolVersionChanged(clusterVersion); }, nullptr); + return Void(); + }); +} + +// Called when a change to the protocol version of the cluster has been detected. Must be called from the main +// thread. +void MultiVersionDatabase::DatabaseState::protocolVersionChanged(ProtocolVersion protocolVersion) { + if (dbProtocolVersion.present() && + protocolVersion.normalizedVersion() == dbProtocolVersion.get().normalizedVersion()) { + dbProtocolVersion = protocolVersion; + } else { + TraceEvent("ProtocolVersionChanged") + .detail("NewProtocolVersion", protocolVersion) + .detail("OldProtocolVersion", dbProtocolVersion); + + dbProtocolVersion = protocolVersion; + auto itr = clients.find(protocolVersion.normalizedVersion()); + + if (itr != clients.end()) { + auto& client = itr->second; + TraceEvent("CreatingDatabaseOnExternalClient") + .detail("LibraryPath", client->libPath) + .detail("Failed", client->failed); + + Reference newDb = client->api->createDatabase(clusterFilePath.c_str()); + + optionLock.enter(); + for (auto option : options) { + try { + newDb->setOption( + option.first, + option.second.castTo()); // In practice, this will set a deferred error instead + // of throwing. 
If that happens, the database will be + // unusable (attempts to use it will throw errors). + } catch (Error& e) { + optionLock.leave(); + TraceEvent(SevError, "ClusterVersionChangeOptionError") + .error(e) + .detail("Option", option.first) + .detail("OptionValue", option.second) + .detail("LibPath", client->libPath); + client->failed = true; + MultiVersionApi::api->updateSupportedVersions(); + db = Reference(); // If we can't set all of the options on a cluster, we abandon the + // client + break; + } } - newIndex = i; - break; - } - } - - if (newIndex == -1) { - ASSERT_EQ(currentClientIndex, 0); // This can only happen for the local client, which we set as the current - // connection before we know it's connected - return; - } - - // Restart connection for replaced client - auto newDb = connectionAttempts[newIndex]->candidateDatabase; - - optionLock.enter(); - for (auto option : options) { - try { - newDb->setOption(option.first, - option.second.castTo()); // In practice, this will set a deferred error instead - // of throwing. If that happens, the database will be - // unusable (attempts to use it will throw errors). 
- } catch (Error& e) { + db = newDb; + if (dbProtocolVersion.get().hasStableInterfaces()) { + versionMonitorDb = db; + } else { + versionMonitorDb = MultiVersionApi::api->getLocalClient()->api->createDatabase(clusterFilePath.c_str()); + } optionLock.leave(); - TraceEvent(SevError, "ClusterVersionChangeOptionError") - .error(e) - .detail("Option", option.first) - .detail("OptionValue", option.second) - .detail("LibPath", clients[newIndex]->libPath); - connectionAttempts[newIndex]->connected = false; - clients[newIndex]->failed = true; - MultiVersionApi::api->updateSupportedVersions(); - return; // If we can't set all of the options on a cluster, we abandon the client + } else { + db = Reference(); + versionMonitorDb = MultiVersionApi::api->getLocalClient()->api->createDatabase(clusterFilePath.c_str()); } + + dbVar->set(db); } - db = newDb; - optionLock.leave(); - - dbVar->set(db); - - if (currentClientIndex >= 0 && connectionAttempts[currentClientIndex]->connected) { - connectionAttempts[currentClientIndex]->connected = false; - connectionAttempts[currentClientIndex]->connect(); - } - - ASSERT(newIndex >= 0 && newIndex < clients.size()); - currentClientIndex = newIndex; + protocolVersionMonitor = monitorProtocolVersion(); } -void MultiVersionDatabase::DatabaseState::addConnection(Reference client, std::string clusterFilePath) { - clients.push_back(client); - connectionAttempts.push_back( - makeReference(Reference::addRef(this), client, clusterFilePath)); -} - -void MultiVersionDatabase::DatabaseState::startConnections() { - for (auto c : connectionAttempts) { - c->connect(); - } -} - -void MultiVersionDatabase::DatabaseState::cancelConnections() { - addref(); - onMainThreadVoid( - [this]() { - for (auto c : connectionAttempts) { - c->cancel(); - } - - connectionAttempts.clear(); - clients.clear(); - delref(); - }, - nullptr); -} +std::atomic_flag MultiVersionDatabase::externalClientsInitialized = ATOMIC_FLAG_INIT; // MultiVersionApi - bool 
MultiVersionApi::apiVersionAtLeast(int minVersion) { ASSERT_NE(MultiVersionApi::api->apiVersion, 0); return MultiVersionApi::api->apiVersion >= minVersion || MultiVersionApi::api->apiVersion < 0; @@ -1608,6 +1541,7 @@ void MultiVersionApi::addNetworkThreadCompletionHook(void (*hook)(void*), void* } } +// Creates an IDatabase object that represents a connections to the cluster Reference MultiVersionApi::createDatabase(const char* clusterFilePath) { lock.enter(); if (!networkSetup) { @@ -1622,28 +1556,21 @@ Reference MultiVersionApi::createDatabase(const char* clusterFilePath int threadIdx = nextThread; nextThread = (nextThread + 1) % threadCount; lock.leave(); - for (auto it : externalClients) { - TraceEvent("CreatingDatabaseOnExternalClient") - .detail("LibraryPath", it.first) - .detail("Failed", it.second[threadIdx]->failed); - } - return Reference(new MultiVersionDatabase(this, threadIdx, clusterFile, Reference())); + + Reference localDb = localClient->api->createDatabase(clusterFilePath); + return Reference( + new MultiVersionDatabase(this, threadIdx, clusterFile, Reference(), localDb)); } lock.leave(); ASSERT_LE(threadCount, 1); - auto db = localClient->api->createDatabase(clusterFilePath); + Reference localDb = localClient->api->createDatabase(clusterFilePath); if (bypassMultiClientApi) { - return db; + return localDb; } else { - for (auto it : externalClients) { - TraceEvent("CreatingDatabaseOnExternalClient") - .detail("LibraryPath", it.first) - .detail("Failed", it.second[0]->failed); - } - return Reference(new MultiVersionDatabase(this, 0, clusterFile, db)); + return Reference(new MultiVersionDatabase(this, 0, clusterFile, Reference(), localDb)); } } diff --git a/fdbclient/MultiVersionTransaction.h b/fdbclient/MultiVersionTransaction.h index badb848334..c8aaeb840e 100644 --- a/fdbclient/MultiVersionTransaction.h +++ b/fdbclient/MultiVersionTransaction.h @@ -271,7 +271,7 @@ public: void setOption(FDBDatabaseOptions::Option option, Optional value = 
Optional()) override; double getMainThreadBusyness() override; - // Returns the protocol version reported by a quorum of coordinators + // Returns the protocol version reported by the coordinator this client is connected to // If an expected version is given, the future won't return until the protocol version is different than expected ThreadFuture getServerProtocol( Optional expectedVersion = Optional()) override; @@ -437,14 +437,14 @@ public: int threadIdx, std::string clusterFilePath, Reference db, + Reference versionMonitorDb, bool openConnectors = true); - ~MultiVersionDatabase() override; Reference createTransaction() override; void setOption(FDBDatabaseOptions::Option option, Optional value = Optional()) override; double getMainThreadBusyness() override; - // Returns the protocol version reported by a quorum of coordinators + // Returns the protocol version reported by the coordinator this client is connected to // If an expected version is given, the future won't return until the protocol version is different than expected ThreadFuture getServerProtocol( Optional expectedVersion = Optional()) override; @@ -452,67 +452,59 @@ public: void addref() override { ThreadSafeReferenceCounted::addref(); } void delref() override { ThreadSafeReferenceCounted::delref(); } + // Create a MultiVersionDatabase that wraps an already created IDatabase object + // For internal use in testing static Reference debugCreateFromExistingDatabase(Reference db); ThreadFuture rebootWorker(const StringRef& address, bool check, int duration) override; ThreadFuture forceRecoveryWithDataLoss(const StringRef& dcid) override; ThreadFuture createSnapshot(const StringRef& uid, const StringRef& snapshot_command) override; -private: - struct DatabaseState; - - struct Connector : ThreadCallback, ThreadSafeReferenceCounted { - Connector(Reference dbState, Reference client, std::string clusterFilePath) - : dbState(dbState), client(client), clusterFilePath(clusterFilePath), connected(false), 
cancelled(false) {} - - void connect(); - void cancel(); - - bool canFire(int notMadeActive) const override { return true; } - void fire(const Void& unused, int& userParam) override; - void error(const Error& e, int& userParam) override; - - const Reference client; - const std::string clusterFilePath; - - const Reference dbState; - - ThreadFuture connectionFuture; - - Reference candidateDatabase; - Reference tr; - - bool connected; - bool cancelled; - }; + // private: + // A struct that manages the current connection state of the MultiVersionDatabase. This wraps the underlying + // IDatabase object that is currently interacting with the cluster. struct DatabaseState : ThreadSafeReferenceCounted { - DatabaseState(); + DatabaseState(std::string clusterFilePath, Reference versionMonitorDb); - void stateChanged(); - void addConnection(Reference client, std::string clusterFilePath); - void startConnections(); - void cancelConnections(); + // Called when a change to the protocol version of the cluster has been detected. Must be called from the main + // thread. + void protocolVersionChanged(ProtocolVersion protocolVersion); + + // Adds a client (local or externally loaded) that can be used to connect to the cluster + void addClient(Reference client); + + // Watch the cluster protocol version for changes and update the database state when it does + ThreadFuture monitorProtocolVersion(); Reference db; const Reference>> dbVar; + std::string clusterFilePath; + + // Used to monitor the cluster protocol version. Will be the same as db unless we have either not connected + // yet or if the client version associated with db does not support protocol monitoring. In those cases, this + // will be a specially created local db. 
+ Reference versionMonitorDb; ThreadFuture changed; bool cancelled; - int currentClientIndex; - std::vector> clients; - std::vector> connectionAttempts; + ThreadFuture protocolVersionMonitor; + Optional dbProtocolVersion; + std::map> clients; std::vector>>> options; UniqueOrderedOptionList transactionDefaultOptions; Mutex optionLock; }; - std::string clusterFilePath; const Reference dbState; friend class MultiVersionTransaction; + + // Clients must create a database object in order to initialize some of their state. + // This needs to be done only once, and this flag tracks whether that has happened. + static std::atomic_flag externalClientsInitialized; }; // An implementation of IClientApi that can choose between multiple different client implementations either provided @@ -530,6 +522,7 @@ public: void stopNetwork() override; void addNetworkThreadCompletionHook(void (*hook)(void*), void* hookParameter) override; + // Creates an IDatabase object that represents a connections to the cluster Reference createDatabase(const char* clusterFilePath) override; static MultiVersionApi* api; diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 6615e973dd..4a6239346e 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -898,6 +898,7 @@ Future> HealthMetricsRangeImpl::getRange(ReadYourWrit DatabaseContext::DatabaseContext(Reference>> connectionFile, Reference> clientInfo, + Reference>> coordinator, Future clientInfoMonitor, TaskPriority taskID, LocalityData const& clientLocality, @@ -906,9 +907,10 @@ DatabaseContext::DatabaseContext(Reference> clientInfo, Future clientInfoMonitor, LocalityData clientLocality, @@ -1166,6 +1170,7 @@ Database DatabaseContext::create(Reference> clientInfo, bool switchable) { return Database(new DatabaseContext(Reference>>(), clientInfo, + makeReference>>(), clientInfoMonitor, taskID, clientLocality, @@ -1446,6 +1451,9 @@ void DatabaseContext::expireThrottles() { extern IPAddress 
determinePublicIPAutomatically(ClusterConnectionString const& ccs); +// Creates a database object that represents a connection to a cluster +// This constructor uses a preallocated DatabaseContext that may have been created +// on another thread Database Database::createDatabase(Reference connFile, int apiVersion, bool internal, @@ -1492,15 +1500,20 @@ Database Database::createDatabase(Reference connFile, g_network->initTLS(); auto clientInfo = makeReference>(); + auto coordinator = makeReference>>(); auto connectionFile = makeReference>>(); connectionFile->set(connFile); - Future clientInfoMonitor = monitorProxies( - connectionFile, clientInfo, networkOptions.supportedVersions, StringRef(networkOptions.traceLogGroup)); + Future clientInfoMonitor = monitorProxies(connectionFile, + clientInfo, + coordinator, + networkOptions.supportedVersions, + StringRef(networkOptions.traceLogGroup)); DatabaseContext* db; if (preallocatedDb) { db = new (preallocatedDb) DatabaseContext(connectionFile, clientInfo, + coordinator, clientInfoMonitor, TaskPriority::DefaultEndpoint, clientLocality, @@ -1512,6 +1525,7 @@ Database Database::createDatabase(Reference connFile, } else { db = new DatabaseContext(connectionFile, clientInfo, + coordinator, clientInfoMonitor, TaskPriority::DefaultEndpoint, clientLocality, @@ -4872,48 +4886,95 @@ Future> Transaction::getVersionstamp() { return versionstampPromise.getFuture(); } -ACTOR Future coordinatorProtocolsFetcher(Reference f) { - state ClientCoordinators coord(f); +// Gets the protocol version reported by a coordinator via the protocol info interface +ACTOR Future getCoordinatorProtocol(NetworkAddressList coordinatorAddresses) { + RequestStream requestStream{ Endpoint{ { coordinatorAddresses }, WLTOKEN_PROTOCOL_INFO } }; + ProtocolInfoReply reply = wait(retryBrokenPromise(requestStream, ProtocolInfoRequest{})); - state vector> coordProtocols; - coordProtocols.reserve(coord.clientLeaderServers.size()); - for (int i = 0; i < 
coord.clientLeaderServers.size(); i++) { - RequestStream requestStream{ Endpoint{ - { coord.clientLeaderServers[i].getLeader.getEndpoint().addresses }, WLTOKEN_PROTOCOL_INFO } }; - coordProtocols.push_back(retryBrokenPromise(requestStream, ProtocolInfoRequest{})); - } - - wait(smartQuorum(coordProtocols, coordProtocols.size() / 2 + 1, 1.5)); - - std::unordered_map protocolCount; - for (int i = 0; i < coordProtocols.size(); i++) { - if (coordProtocols[i].isReady()) { - protocolCount[coordProtocols[i].get().version.version()]++; - } - } - - uint64_t majorityProtocol = std::max_element(protocolCount.begin(), - protocolCount.end(), - [](const std::pair& l, - const std::pair& r) { return l.second < r.second; }) - ->first; - return ProtocolVersion(majorityProtocol); + return reply.version; } -// Returns the protocol version reported by a quorum of coordinators -// If an expected version is given, the future won't return until the protocol version is different than expected -ACTOR Future getClusterProtocol(Reference f, - Optional expectedVersion) { +// Gets the protocol version reported by a coordinator in its connect packet +// If we are unable to get a version from the connect packet (e.g. because we lost connection with the peer), then this +// function will return with an unset result. 
+// If an expected version is given, this future won't return if the actual protocol version matches the expected version +ACTOR Future> getCoordinatorProtocolFromConnectPacket( + NetworkAddress coordinatorAddress, + Optional expectedVersion) { + + state Reference>> protocolVersion = + FlowTransport::transport().getPeerProtocolAsyncVar(coordinatorAddress); + loop { - ProtocolVersion protocolVersion = wait(coordinatorProtocolsFetcher(f)); - if (!expectedVersion.present() || protocolVersion != expectedVersion.get()) { - return protocolVersion; - } else { - wait(delay(2.0)); // TODO: this is temporary, so not making into a knob yet + if (protocolVersion->get().present() && + (!expectedVersion.present() || expectedVersion.get() != protocolVersion->get().get())) { + return protocolVersion->get(); + } + + Future change = protocolVersion->onChange(); + if (!protocolVersion->get().present()) { + // If we still don't have any connection info after a timeout, retry sending the protocol version request + change = timeout(change, FLOW_KNOBS->CONNECTION_MONITOR_TIMEOUT, Void()); + } + + wait(change); + + if (!protocolVersion->get().present()) { + return protocolVersion->get(); } } } +// Returns the protocol version reported by the given coordinator +// If an expected version is given, the future won't return until the protocol version is different than expected +ACTOR Future getClusterProtocolImpl( + Reference>> coordinator, + Optional expectedVersion) { + + state bool needToConnect = true; + state Future protocolVersion = Never(); + + loop { + if (!coordinator->get().present()) { + wait(coordinator->onChange()); + } else { + Endpoint coordinatorEndpoint = coordinator->get().get().getLeader.getEndpoint(); + if (needToConnect) { + // Even though we typically rely on the connect packet to get the protocol version, we need to send some + // request in order to start a connection. This protocol version request serves that purpose. 
+ protocolVersion = getCoordinatorProtocol(coordinatorEndpoint.addresses); + needToConnect = false; + } + choose { + when(wait(coordinator->onChange())) { needToConnect = true; } + + when(ProtocolVersion pv = wait(protocolVersion)) { + if (!expectedVersion.present() || expectedVersion.get() != pv) { + return pv; + } + } + + // Older versions of FDB don't have an endpoint to return the protocol version, so we get this info from + // the connect packet + when(Optional pv = wait(getCoordinatorProtocolFromConnectPacket( + coordinatorEndpoint.getPrimaryAddress(), expectedVersion))) { + if (pv.present()) { + return pv.get(); + } else { + needToConnect = true; + } + } + } + } + } +} + +// Returns the protocol version reported by the coordinator this client is currently connected to +// If an expected version is given, the future won't return until the protocol version is different than expected +Future DatabaseContext::getClusterProtocol(Optional expectedVersion) { + return getClusterProtocolImpl(coordinator, expectedVersion); +} + uint32_t Transaction::getSize() { auto s = tr.transaction.mutations.expectedSize() + tr.transaction.read_conflict_ranges.expectedSize() + tr.transaction.write_conflict_ranges.expectedSize(); diff --git a/fdbclient/NativeAPI.actor.h b/fdbclient/NativeAPI.actor.h index 51411ae0a2..9f9b0057ca 100644 --- a/fdbclient/NativeAPI.actor.h +++ b/fdbclient/NativeAPI.actor.h @@ -76,11 +76,15 @@ class Database { public: enum { API_VERSION_LATEST = -1 }; + // Creates a database object that represents a connection to a cluster + // This constructor uses a preallocated DatabaseContext that may have been created + // on another thread static Database createDatabase(Reference connFile, int apiVersion, bool internal = true, LocalityData const& clientLocality = LocalityData(), DatabaseContext* preallocatedDb = nullptr); + static Database createDatabase(std::string connFileName, int apiVersion, bool internal = true, @@ -400,11 +404,6 @@ ACTOR Future 
snapCreate(Database cx, Standalone snapCmd, UID sn // Checks with Data Distributor that it is safe to mark all servers in exclusions as failed ACTOR Future checkSafeExclusions(Database cx, vector exclusions); -// Returns the protocol version reported by a quorum of coordinators -// If an expected version is given, the future won't return until the protocol version is different than expected -ACTOR Future getClusterProtocol(Reference f, - Optional expectedVersion); - inline uint64_t getWriteOperationCost(uint64_t bytes) { return bytes / std::max(1, CLIENT_KNOBS->WRITE_COST_BYTE_FACTOR) + 1; } diff --git a/fdbclient/ThreadSafeTransaction.cpp b/fdbclient/ThreadSafeTransaction.cpp index c5bf2dce87..ce17338af7 100644 --- a/fdbclient/ThreadSafeTransaction.cpp +++ b/fdbclient/ThreadSafeTransaction.cpp @@ -97,13 +97,12 @@ double ThreadSafeDatabase::getMainThreadBusyness() { return g_network->networkInfo.metrics.networkBusyness; } -// Returns the protocol version reported by a quorum of coordinators +// Returns the protocol version reported by the coordinator this client is connected to // If an expected version is given, the future won't return until the protocol version is different than expected ThreadFuture ThreadSafeDatabase::getServerProtocol(Optional expectedVersion) { DatabaseContext* db = this->db; - return onMainThread([db, expectedVersion]() -> Future { - return getClusterProtocol(db->getConnectionFile(), expectedVersion); - }); + return onMainThread( + [db, expectedVersion]() -> Future { return db->getClusterProtocol(expectedVersion); }); } ThreadSafeDatabase::ThreadSafeDatabase(std::string connFilename, int apiVersion) { diff --git a/fdbclient/ThreadSafeTransaction.h b/fdbclient/ThreadSafeTransaction.h index e6360c2a6d..407f9aefae 100644 --- a/fdbclient/ThreadSafeTransaction.h +++ b/fdbclient/ThreadSafeTransaction.h @@ -39,7 +39,7 @@ public: void setOption(FDBDatabaseOptions::Option option, Optional value = Optional()) override; double getMainThreadBusyness() 
override; - // Returns the protocol version reported by a quorum of coordinators + // Returns the protocol version reported by the coordinator this client is connected to // If an expected version is given, the future won't return until the protocol version is different than expected ThreadFuture getServerProtocol( Optional expectedVersion = Optional()) override; diff --git a/fdbrpc/FlowTransport.actor.cpp b/fdbrpc/FlowTransport.actor.cpp index 56fca670b2..b7221c8876 100644 --- a/fdbrpc/FlowTransport.actor.cpp +++ b/fdbrpc/FlowTransport.actor.cpp @@ -760,6 +760,13 @@ ACTOR Future connectionKeeper(Reference self, conn->close(); conn = Reference(); + + // Old versions will throw this error, and we don't want to forget their protocol versions. + // This means we can't tell the difference between an old protocol version and one we + // can no longer connect to. + if (e.code() != error_code_incompatible_protocol_version) { + self->protocolVersion->set(Optional()); + } } // Clients might send more packets in response, which needs to go out on the next connection @@ -787,7 +794,8 @@ Peer::Peer(TransportData* transport, NetworkAddress const& destination) incompatibleProtocolVersionNewer(false), peerReferences(-1), bytesReceived(0), lastDataPacketSentTime(now()), pingLatencies(destination.isPublic() ? FLOW_KNOBS->PING_SAMPLE_AMOUNT : 1), lastLoggedBytesReceived(0), bytesSent(0), lastLoggedBytesSent(0), lastLoggedTime(0.0), connectOutgoingCount(0), connectIncomingCount(0), - connectFailedCount(0), connectLatencies(destination.isPublic() ? FLOW_KNOBS->NETWORK_CONNECT_SAMPLE_AMOUNT : 1) { + connectFailedCount(0), connectLatencies(destination.isPublic() ? FLOW_KNOBS->NETWORK_CONNECT_SAMPLE_AMOUNT : 1), + protocolVersion(Reference>>(new AsyncVar>())) { IFailureMonitor::failureMonitor().setStatus(destination, FailureStatus(false)); } @@ -1103,12 +1111,12 @@ static int getNewBufferSize(const uint8_t* begin, packetLen + sizeof(uint32_t) * (peerAddress.isTLS() ? 
2 : 3)); } +// This actor exists whenever there is an open or opening connection, whether incoming or outgoing +// For incoming connections conn is set and peer is initially nullptr; for outgoing connections it is the reverse ACTOR static Future connectionReader(TransportData* transport, Reference conn, Reference peer, Promise> onConnected) { - // This actor exists whenever there is an open or opening connection, whether incoming or outgoing - // For incoming connections conn is set and peer is initially nullptr; for outgoing connections it is the reverse state Arena arena; state uint8_t* unprocessed_begin = nullptr; @@ -1209,6 +1217,7 @@ ACTOR static Future connectionReader(TransportData* transport, if (!protocolVersion.hasMultiVersionClient()) { // Older versions expected us to hang up. It may work even if we don't hang up here, but // it's safer to keep the old behavior. + peer->protocolVersion->set(peerProtocolVersion); throw incompatible_protocol_version(); } } else { @@ -1256,6 +1265,7 @@ ACTOR static Future connectionReader(TransportData* transport, onConnected.send(peer); wait(delay(0)); // Check for cancellation } + peer->protocolVersion->set(peerProtocolVersion); } } @@ -1669,6 +1679,16 @@ Reference> FlowTransport::getDegraded() { return self->degraded; } +// Returns the protocol version of the peer at the specified address. The result is returned as an AsyncVar that +// can be used to monitor for changes of a peer's protocol. The protocol version will be unset in the event that +// there is no connection established to the peer. +// +// Note that this function does not establish a connection to the peer. In order to obtain a peer's protocol +// version, some other mechanism should be used to connect to that peer. 
+Reference>> FlowTransport::getPeerProtocolAsyncVar(NetworkAddress addr) { + return self->peers.at(addr)->protocolVersion; +} + void FlowTransport::resetConnection(NetworkAddress address) { auto peer = self->getPeer(address); if (peer) { diff --git a/fdbrpc/FlowTransport.h b/fdbrpc/FlowTransport.h index bdec8237bd..e2bbfddeee 100644 --- a/fdbrpc/FlowTransport.h +++ b/fdbrpc/FlowTransport.h @@ -152,6 +152,9 @@ struct Peer : public ReferenceCounted { double lastLoggedTime; int64_t lastLoggedBytesReceived; int64_t lastLoggedBytesSent; + + Reference>> protocolVersion; + // Cleared every time stats are logged for this peer. int connectOutgoingCount; int connectIncomingCount; @@ -174,64 +177,64 @@ public: FlowTransport(uint64_t transportId); ~FlowTransport(); - static void createInstance(bool isClient, uint64_t transportId); // Creates a new FlowTransport and makes FlowTransport::transport() return it. This uses g_network->global() // variables, so it will be private to a simulation. + static void createInstance(bool isClient, uint64_t transportId); static bool isClient() { return g_network->global(INetwork::enClientFailureMonitor) != nullptr; } - void initMetrics(); // Metrics must be initialized after FlowTransport::createInstance has been called + void initMetrics(); - Future bind(NetworkAddress publicAddress, NetworkAddress listenAddress); // Starts a server listening on the given listenAddress, and sets publicAddress to be the public // address of this server. Returns only errors. + Future bind(NetworkAddress publicAddress, NetworkAddress listenAddress); - NetworkAddress getLocalAddress() const; // Returns first local NetworkAddress. + NetworkAddress getLocalAddress() const; - NetworkAddressList getLocalAddresses() const; // Returns all local NetworkAddress. 
+ NetworkAddressList getLocalAddresses() const; - std::map>* getIncompatiblePeers(); // Returns the same of all peers that have attempted to connect, but have incompatible protocol versions + std::map>* getIncompatiblePeers(); - Future onIncompatibleChanged(); // Returns when getIncompatiblePeers has at least one peer which is incompatible. + Future onIncompatibleChanged(); - void addPeerReference(const Endpoint&, bool isStream); // Signal that a peer connection is being used, even if no messages are currently being sent to the peer + void addPeerReference(const Endpoint&, bool isStream); - void removePeerReference(const Endpoint&, bool isStream); // Signal that a peer connection is no longer being used + void removePeerReference(const Endpoint&, bool isStream); - void addEndpoint(Endpoint& endpoint, NetworkMessageReceiver*, TaskPriority taskID); // Sets endpoint to be a new local endpoint which delivers messages to the given receiver + void addEndpoint(Endpoint& endpoint, NetworkMessageReceiver*, TaskPriority taskID); void addEndpoints(std::vector> const& streams); - void removeEndpoint(const Endpoint&, NetworkMessageReceiver*); // The given local endpoint no longer delivers messages to the given receiver or uses resources + void removeEndpoint(const Endpoint&, NetworkMessageReceiver*); - void addWellKnownEndpoint(Endpoint& endpoint, NetworkMessageReceiver*, TaskPriority taskID); // Sets endpoint to a new local endpoint (without changing its token) which delivers messages to the given receiver // Implementations may have limitations on when this function is called and what endpoint.token may be! + void addWellKnownEndpoint(Endpoint& endpoint, NetworkMessageReceiver*, TaskPriority taskID); + // sendReliable will keep trying to deliver the data to the destination until cancelReliable is called. It will + // retry sending if the connection is closed or the failure manager reports the destination become available (edge + // triggered). 
ReliablePacket* sendReliable(ISerializeSource const& what, const Endpoint& destination); - // sendReliable will keep trying to deliver the data to the destination until cancelReliable is - // called. It will retry sending if the connection is closed or the failure manager reports - // the destination become available (edge triggered). + // Makes Packet "unreliable" (either the data or a connection close event will be delivered eventually). It can + // still be used safely to send a reply to a "reliable" request. void cancelReliable(ReliablePacket*); - // Makes Packet "unreliable" (either the data or a connection close event will be delivered - // eventually). It can still be used safely to send a reply to a "reliable" request. - Reference> getDegraded(); // This async var will be set to true when the process cannot connect to a public network address that the failure // monitor thinks is healthy. + Reference> getDegraded(); - void resetConnection(NetworkAddress address); // Forces the connection with this address to be reset + void resetConnection(NetworkAddress address); Reference sendUnreliable(ISerializeSource const& what, const Endpoint& destination, @@ -239,6 +242,14 @@ public: bool incompatibleOutgoingConnectionsPresent(); + // Returns the protocol version of the peer at the specified address. The result is returned as an AsyncVar that + // can be used to monitor for changes of a peer's protocol. The protocol version will be unset in the event that + // there is no connection established to the peer. + // + // Note that this function does not establish a connection to the peer. In order to obtain a peer's protocol + // version, some other mechanism should be used to connect to that peer. 
+ Reference>> getPeerProtocolAsyncVar(NetworkAddress addr); + static FlowTransport& transport() { return *static_cast((void*)g_network->global(INetwork::enFlowTransport)); } diff --git a/flow/ProtocolVersion.h b/flow/ProtocolVersion.h index 1a5bd816b8..74da1dfd70 100644 --- a/flow/ProtocolVersion.h +++ b/flow/ProtocolVersion.h @@ -20,6 +20,7 @@ #pragma once #include +#include "flow/Trace.h" #define PROTOCOL_VERSION_FEATURE(v, x) \ struct x { \ @@ -50,6 +51,10 @@ public: return (other.version() & compatibleProtocolVersionMask) == (version() & compatibleProtocolVersionMask); } + // Returns a normalized protocol version that will be the same for all compatible versions + constexpr ProtocolVersion normalizedVersion() const { + return ProtocolVersion(_version & compatibleProtocolVersionMask); + } constexpr bool isValid() const { return version() >= minValidProtocolVersion; } constexpr uint64_t version() const { return _version & versionFlagMask; } @@ -134,6 +139,13 @@ public: // introduced features PROTOCOL_VERSION_FEATURE(0x0FDB00B070010001LL, SpanContext); }; +template <> +struct Traceable : std::true_type { + static std::string toString(const ProtocolVersion& protocolVersion) { + return format("0x%016lX", protocolVersion.version()); + } +}; + // These impact both communications and the deserialization of certain database and IKeyValueStore keys. // // The convention is that 'x' and 'y' should match the major and minor version of the software, and 'z' should be 0. From 711fb5829369458c1bb2305af0369af72c6b3043 Mon Sep 17 00:00:00 2001 From: RenxuanW Date: Thu, 15 Apr 2021 12:40:39 -0700 Subject: [PATCH 064/180] Improve logging on worker joining cluster 1. Logging on worker nodes when it joins a cluster and which cluster; 2. Log the connection string that is being used by worker; 3. Log a warning when a worker fails to join a cluster for longer than 5min, either because it doesn't know which cluster to join, or fails to get a RegisterWorkerReply within 5min. 
--- fdbserver/worker.actor.cpp | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/fdbserver/worker.actor.cpp b/fdbserver/worker.actor.cpp index 7e8ddbaf79..76554189be 100644 --- a/fdbserver/worker.actor.cpp +++ b/fdbserver/worker.actor.cpp @@ -526,9 +526,9 @@ ACTOR Future registrationClient(ReferencegetConnectionString().toString(); if (connFile && !connFile->fileContentsUpToDate(fileConnectionString)) { request.issues.push_back_deep(request.issues.arena(), LiteralStringRef("incorrect_cluster_file_contents")); - std::string connectionString = connFile->getConnectionString().toString(); if (!incorrectTime.present()) { incorrectTime = now(); } @@ -542,6 +542,12 @@ ACTOR Future registrationClient(Reference(); + if (connFile->canGetFilename()) { + TraceEvent("ClusterFileContents") + .detail("Filename", connFile->getFilename()) + .detail("ConnectionStringFromFile", fileConnectionString.toString()) + .detail("CurrentConnectionString", connectionString); + } } auto peers = FlowTransport::transport().getIncompatiblePeers(); @@ -554,21 +560,27 @@ ACTOR Future registrationClient(Reference registrationReply = + state Future registrationReply = ccInterface->get().present() ? 
brokenPromiseToNever(ccInterface->get().get().registerWorker.getReply(request)) : Never(); - choose { + state double startTime = now(); + loop choose { when(RegisterWorkerReply reply = wait(registrationReply)) { processClass = reply.processClass; asyncPriorityInfo->set(reply.priorityInfo); + TraceEvent("WorkerJoiningCluster").detail("CCID", ccInterface->get().get().id()); + break; } - when(wait(ccInterface->onChange())) {} - when(wait(ddInterf->onChange())) {} - when(wait(rkInterf->onChange())) {} - when(wait(degraded->onChange())) {} - when(wait(FlowTransport::transport().onIncompatibleChanged())) {} - when(wait(issues->onChange())) {} + when(wait(delay(300))) { // 5 min + TraceEvent(SevWarn, "WorkerNotJoinedClusterForLongTime").detail("WaitTime", now() - startTime); + } + when(wait(ccInterface->onChange())) { break; } + when(wait(ddInterf->onChange())) { break; } + when(wait(rkInterf->onChange())) { break; } + when(wait(degraded->onChange())) { break; } + when(wait(FlowTransport::transport().onIncompatibleChanged())) { break; } + when(wait(issues->onChange())) { break; } } } } From 486260e944c3362a283eb1494a79e12ba873f3af Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Thu, 15 Apr 2021 13:36:31 -0700 Subject: [PATCH 065/180] Fix infinite loop with stable interface protocol monitoring. Fix case where getting an error with a network option didn't properly terminate the database connection. Reduce option lock critical section. 
--- fdbclient/MultiVersionTransaction.actor.cpp | 24 +++++++++++---------- fdbclient/NativeAPI.actor.cpp | 2 ++ 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/fdbclient/MultiVersionTransaction.actor.cpp b/fdbclient/MultiVersionTransaction.actor.cpp index 57f23e3d88..b39fde5cfd 100644 --- a/fdbclient/MultiVersionTransaction.actor.cpp +++ b/fdbclient/MultiVersionTransaction.actor.cpp @@ -1029,22 +1029,23 @@ void MultiVersionDatabase::DatabaseState::protocolVersionChanged(ProtocolVersion if (itr != clients.end()) { auto& client = itr->second; - TraceEvent("CreatingDatabaseOnExternalClient") + TraceEvent("CreatingDatabaseOnClient") .detail("LibraryPath", client->libPath) - .detail("Failed", client->failed); + .detail("Failed", client->failed) + .detail("External", client->external); Reference newDb = client->api->createDatabase(clusterFilePath.c_str()); optionLock.enter(); for (auto option : options) { try { - newDb->setOption( - option.first, - option.second.castTo()); // In practice, this will set a deferred error instead - // of throwing. If that happens, the database will be - // unusable (attempts to use it will throw errors). + // In practice, this will set a deferred error instead of throwing. If that happens, the database + // will be unusable (attempts to use it will throw errors). 
+ newDb->setOption(option.first, option.second.castTo()); } catch (Error& e) { optionLock.leave(); + + // If we can't set all of the options on a cluster, we abandon the client TraceEvent(SevError, "ClusterVersionChangeOptionError") .error(e) .detail("Option", option.first) @@ -1052,19 +1053,20 @@ void MultiVersionDatabase::DatabaseState::protocolVersionChanged(ProtocolVersion .detail("LibPath", client->libPath); client->failed = true; MultiVersionApi::api->updateSupportedVersions(); - db = Reference(); // If we can't set all of the options on a cluster, we abandon the - // client + newDb = Reference(); break; } } db = newDb; - if (dbProtocolVersion.get().hasStableInterfaces()) { + + optionLock.leave(); + + if (dbProtocolVersion.get().hasStableInterfaces() && db) { versionMonitorDb = db; } else { versionMonitorDb = MultiVersionApi::api->getLocalClient()->api->createDatabase(clusterFilePath.c_str()); } - optionLock.leave(); } else { db = Reference(); versionMonitorDb = MultiVersionApi::api->getLocalClient()->api->createDatabase(clusterFilePath.c_str()); diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 4a6239346e..f673d025c6 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -4952,6 +4952,8 @@ ACTOR Future getClusterProtocolImpl( if (!expectedVersion.present() || expectedVersion.get() != pv) { return pv; } + + protocolVersion = Never(); } // Older versions of FDB don't have an endpoint to return the protocol version, so we get this info from From 551268b0f25be9f1eb35089e702a6a78cd2d1913 Mon Sep 17 00:00:00 2001 From: Lukas Joswiak Date: Thu, 15 Apr 2021 13:50:50 -0700 Subject: [PATCH 066/180] Add well known endpoint for worker communication --- cmake/GetMsgpack.cmake | 7 +++- fdbclient/NativeAPI.actor.cpp | 4 ++ fdbclient/ProcessInterface.h | 57 +++++++++++++++++++++++++++++ fdbclient/SpecialKeySpace.actor.cpp | 37 ++++++++++++++++++- fdbclient/SpecialKeySpace.actor.h | 7 ++++ 
fdbrpc/FlowTransport.actor.cpp | 2 +- fdbserver/worker.actor.cpp | 15 ++++++++ flow/Platform.actor.cpp | 18 +-------- 8 files changed, 126 insertions(+), 21 deletions(-) create mode 100644 fdbclient/ProcessInterface.h diff --git a/cmake/GetMsgpack.cmake b/cmake/GetMsgpack.cmake index 0b951d5a1b..dc9a578175 100644 --- a/cmake/GetMsgpack.cmake +++ b/cmake/GetMsgpack.cmake @@ -9,8 +9,11 @@ else() ExternalProject_add(msgpackProject URL "https://github.com/msgpack/msgpack-c/releases/download/cpp-3.3.0/msgpack-3.3.0.tar.gz" URL_HASH SHA256=6e114d12a5ddb8cb11f669f83f32246e484a8addd0ce93f274996f1941c1f07b - CONFIGURE_COMMAND BUILD_COMMAND INSTALL_COMMAND) + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + ) ExternalProject_Get_property(msgpackProject SOURCE_DIR) target_include_directories(msgpack SYSTEM INTERFACE "${SOURCE_DIR}/include") -endif() \ No newline at end of file +endif() diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index b208107fde..f5c135dd23 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -1046,6 +1046,10 @@ DatabaseContext::DatabaseContext(Reference( KeyRangeRef(LiteralStringRef("profiling/"), LiteralStringRef("profiling0")) .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin))); + registerSpecialKeySpaceModule( + SpecialKeySpace::MODULE::ACTORLINEAGE, + SpecialKeySpace::IMPLTYPE::READONLY, + std::make_unique(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::ACTORLINEAGE))); } if (apiVersionAtLeast(630)) { registerSpecialKeySpaceModule(SpecialKeySpace::MODULE::TRANSACTION, diff --git a/fdbclient/ProcessInterface.h b/fdbclient/ProcessInterface.h new file mode 100644 index 0000000000..c76cf9ef48 --- /dev/null +++ b/fdbclient/ProcessInterface.h @@ -0,0 +1,57 @@ +/* + * ProcessInterface.h + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2021 Apple Inc. 
and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "fdbclient/FDBTypes.h" +#include "fdbrpc/fdbrpc.h" + +constexpr UID WLTOKEN_PROCESS(-1, 11); + +struct ProcessInterface { + constexpr static FileIdentifier file_identifier = 985636; + RequestStream getInterface; + RequestStream echo; + + template + void serialize(Ar& ar) { + serializer(ar, echo); + } +}; + +struct GetProcessInterfaceRequest { + constexpr static FileIdentifier file_identifier = 7632546; + ReplyPromise reply; + + template + void serialize(Ar& ar) { + serializer(ar, reply); + } +}; + +// TODO: Used for demonstration purposes, remove in later PR +struct EchoRequest { + constexpr static FileIdentifier file_identifier = 10624019; + std::string message; + ReplyPromise reply; + + template + void serialize(Ar& ar) { + serializer(ar, message, reply); + } +}; diff --git a/fdbclient/SpecialKeySpace.actor.cpp b/fdbclient/SpecialKeySpace.actor.cpp index 5fb7360b0d..eaa35e353d 100644 --- a/fdbclient/SpecialKeySpace.actor.cpp +++ b/fdbclient/SpecialKeySpace.actor.cpp @@ -22,6 +22,7 @@ #include "boost/algorithm/string.hpp" #include "fdbclient/Knobs.h" +#include "fdbclient/ProcessInterface.h" #include "fdbclient/SpecialKeySpace.actor.h" #include "flow/Arena.h" #include "flow/UnitTest.h" @@ -65,9 +66,12 @@ std::unordered_map SpecialKeySpace::moduleToB { SpecialKeySpace::MODULE::CONFIGURATION, KeyRangeRef(LiteralStringRef("\xff\xff/configuration/"), 
LiteralStringRef("\xff\xff/configuration0")) }, { SpecialKeySpace::MODULE::TRACING, - KeyRangeRef(LiteralStringRef("\xff\xff/tracing/"), LiteralStringRef("\xff\xff/tracing0")) } + KeyRangeRef(LiteralStringRef("\xff\xff/tracing/"), LiteralStringRef("\xff\xff/tracing0")) }, + { SpecialKeySpace::MODULE::ACTORLINEAGE, + KeyRangeRef(LiteralStringRef("\xff\xff/actor_lineage/"), LiteralStringRef("\xff\xff/actor_lineage0")) } }; +// TODO: Similar for actor lineage? std::unordered_map SpecialKeySpace::managementApiCommandToRange = { { "exclude", KeyRangeRef(LiteralStringRef("excluded/"), LiteralStringRef("excluded0")) @@ -1794,3 +1798,34 @@ void ClientProfilingImpl::clear(ReadYourWritesTransaction* ryw, const KeyRef& ke "profile", "Clear operation is forbidden for profile client. You can set it to default to disable profiling."); } + +ActorLineageImpl::ActorLineageImpl(KeyRangeRef kr) : SpecialKeyRangeReadImpl(kr) {} + +ACTOR static Future> actorLineageGetRangeActor(ReadYourWritesTransaction* ryw, + KeyRef prefix, + KeyRangeRef kr) { + state Standalone result; + Standalone addressString = kr.begin.removePrefix(prefix); + + try { + auto address = NetworkAddress::parse(addressString.contents().toString()); + + state ProcessInterface process; + process.getInterface = RequestStream(Endpoint({ address }, WLTOKEN_PROCESS)); + ProcessInterface p = wait(retryBrokenPromise(process.getInterface, GetProcessInterfaceRequest{})); + process = p; + + EchoRequest echoRequest; + echoRequest.message = "Hello"; + std::string response = wait(process.echo.getReply(echoRequest)); + result.push_back_deep(result.arena(), KeyValueRef(kr.begin, response)); + } catch (Error& e) { + TraceEvent(SevDebug, "SpecialKeysNetworkParseError").error(e); + } + + return result; +} + +Future> ActorLineageImpl::getRange(ReadYourWritesTransaction* ryw, KeyRangeRef kr) const { + return actorLineageGetRangeActor(ryw, getKeyRange().begin, kr); +} diff --git a/fdbclient/SpecialKeySpace.actor.h 
b/fdbclient/SpecialKeySpace.actor.h index c760a10724..051b17470a 100644 --- a/fdbclient/SpecialKeySpace.actor.h +++ b/fdbclient/SpecialKeySpace.actor.h @@ -142,6 +142,7 @@ public: class SpecialKeySpace { public: enum class MODULE { + ACTORLINEAGE, // Sampling data CLUSTERFILEPATH, CONFIGURATION, // Configuration of the cluster CONNECTIONSTRING, @@ -377,5 +378,11 @@ public: void clear(ReadYourWritesTransaction* ryw, const KeyRef& key) override; }; +class ActorLineageImpl : public SpecialKeyRangeReadImpl { +public: + explicit ActorLineageImpl(KeyRangeRef kr); + Future> getRange(ReadYourWritesTransaction* ryw, KeyRangeRef kr) const override; +}; + #include "flow/unactorcompiler.h" #endif diff --git a/fdbrpc/FlowTransport.actor.cpp b/fdbrpc/FlowTransport.actor.cpp index 56fca670b2..15dac5dea0 100644 --- a/fdbrpc/FlowTransport.actor.cpp +++ b/fdbrpc/FlowTransport.actor.cpp @@ -334,7 +334,7 @@ ACTOR Future pingLatencyLogger(TransportData* self) { } TransportData::TransportData(uint64_t transportId) - : endpoints(/*wellKnownTokenCount*/ 11), endpointNotFoundReceiver(endpoints), pingReceiver(endpoints), + : endpoints(/*wellKnownTokenCount*/ 12), endpointNotFoundReceiver(endpoints), pingReceiver(endpoints), warnAlwaysForLargePacket(true), lastIncompatibleMessage(0), transportId(transportId), numIncompatibleConnections(0) { degraded = makeReference>(false); diff --git a/fdbserver/worker.actor.cpp b/fdbserver/worker.actor.cpp index 2740d9e720..4d05d3f5fe 100644 --- a/fdbserver/worker.actor.cpp +++ b/fdbserver/worker.actor.cpp @@ -22,6 +22,7 @@ #include #include "fdbrpc/Locality.h" +#include "fdbclient/ProcessInterface.h" #include "fdbclient/StorageServerInterface.h" #include "fdbserver/Knobs.h" #include "flow/ActorCollection.h" @@ -2032,6 +2033,19 @@ ACTOR Future serveProtocolInfo() { } } +ACTOR Future serveProcess() { + state ProcessInterface process; + process.getInterface.makeWellKnownEndpoint(WLTOKEN_PROCESS, TaskPriority::DefaultEndpoint); + loop { + choose { + 
when(GetProcessInterfaceRequest req = waitNext(process.getInterface.getFuture())) { + req.reply.send(process); + } + when(EchoRequest req = waitNext(process.echo.getFuture())) { req.reply.send(req.message); } + } + } +} + ACTOR Future fdbd(Reference connFile, LocalityData localities, ProcessClass processClass, @@ -2048,6 +2062,7 @@ ACTOR Future fdbd(Reference connFile, currentLineage->modify(&RoleLineage::role) = ProcessClass::Worker; actors.push_back(serveProtocolInfo()); + actors.push_back(serveProcess()); try { ServerCoordinators coordinators(connFile); diff --git a/flow/Platform.actor.cpp b/flow/Platform.actor.cpp index 78fe11b0a5..4d435afe00 100644 --- a/flow/Platform.actor.cpp +++ b/flow/Platform.actor.cpp @@ -3683,28 +3683,12 @@ void* sampleThread(void* arg) { // Get actor lineage of currently running actor. auto actorLineage = currentLineageThreadSafe.get(); - printf("Currently running actor lineage (%p):\n", actorLineage.getPtr()); - auto stack = actorLineage->stack(&StackLineage::actorName); - while (!stack.empty()) { - printf("%s ", stack.back()); - stack.pop_back(); - } - printf("\n"); + // TODO: Use actorLineage for (const auto& [waitState, lineageFn] : samples) { auto alps = lineageFn(); // TODO: Serialize collected actor linage properties - - printf("Wait State #%d ALPs (%d):\n", waitState, alps.size()); - for (auto actorLineage : alps) { - auto stack = actorLineage->stack(&StackLineage::actorName); - while (!stack.empty()) { - printf("%s ", stack.back()); - stack.pop_back(); - } - printf("\n"); - } } } From 5c33c7c4f59841585e92687e2a69433d5787b57a Mon Sep 17 00:00:00 2001 From: Lukas Joswiak Date: Thu, 15 Apr 2021 13:54:49 -0700 Subject: [PATCH 067/180] Remove TODO --- fdbclient/SpecialKeySpace.actor.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/fdbclient/SpecialKeySpace.actor.cpp b/fdbclient/SpecialKeySpace.actor.cpp index eaa35e353d..b245b049ba 100644 --- a/fdbclient/SpecialKeySpace.actor.cpp +++ b/fdbclient/SpecialKeySpace.actor.cpp @@ 
-71,7 +71,6 @@ std::unordered_map SpecialKeySpace::moduleToB KeyRangeRef(LiteralStringRef("\xff\xff/actor_lineage/"), LiteralStringRef("\xff\xff/actor_lineage0")) } }; -// TODO: Similar for actor lineage? std::unordered_map SpecialKeySpace::managementApiCommandToRange = { { "exclude", KeyRangeRef(LiteralStringRef("excluded/"), LiteralStringRef("excluded0")) From 21c518467a3278dec795a2f104984611dc8674fb Mon Sep 17 00:00:00 2001 From: RenxuanW Date: Thu, 15 Apr 2021 15:38:44 -0700 Subject: [PATCH 068/180] Move 300s to a knob. --- fdbserver/Knobs.cpp | 1 + fdbserver/Knobs.h | 1 + fdbserver/worker.actor.cpp | 2 +- 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/fdbserver/Knobs.cpp b/fdbserver/Knobs.cpp index ad4c797b8d..df3434fb9b 100644 --- a/fdbserver/Knobs.cpp +++ b/fdbserver/Knobs.cpp @@ -616,6 +616,7 @@ void ServerKnobs::initialize(bool randomize, ClientKnobs* clientKnobs, bool isSi //Worker init( WORKER_LOGGING_INTERVAL, 5.0 ); init( HEAP_PROFILER_INTERVAL, 30.0 ); + init( JOIN_CLUSTER_WARNING_INTERVAL, 300.0 ); init( DEGRADED_RESET_INTERVAL, 24*60*60 ); if ( randomize && BUGGIFY ) DEGRADED_RESET_INTERVAL = 10; init( DEGRADED_WARNING_LIMIT, 1 ); init( DEGRADED_WARNING_RESET_DELAY, 7*24*60*60 ); diff --git a/fdbserver/Knobs.h b/fdbserver/Knobs.h index 9a5f2a528c..690bbe6327 100644 --- a/fdbserver/Knobs.h +++ b/fdbserver/Knobs.h @@ -543,6 +543,7 @@ public: // Worker double WORKER_LOGGING_INTERVAL; double HEAP_PROFILER_INTERVAL; + double JOIN_CLUSTER_WARNING_INTERVAL; double DEGRADED_RESET_INTERVAL; double DEGRADED_WARNING_LIMIT; double DEGRADED_WARNING_RESET_DELAY; diff --git a/fdbserver/worker.actor.cpp b/fdbserver/worker.actor.cpp index 76554189be..e03089b618 100644 --- a/fdbserver/worker.actor.cpp +++ b/fdbserver/worker.actor.cpp @@ -572,7 +572,7 @@ ACTOR Future registrationClient(Referenceget().get().id()); break; } - when(wait(delay(300))) { // 5 min + when(wait(delay(SERVER_KNOBS->JOIN_CLUSTER_WARNING_INTERVAL))) { TraceEvent(SevWarn, 
"WorkerNotJoinedClusterForLongTime").detail("WaitTime", now() - startTime); } when(wait(ccInterface->onChange())) { break; } From aba752d12e10e1469184b1b71d944f5e3af5efd7 Mon Sep 17 00:00:00 2001 From: Jon Fu Date: Fri, 16 Apr 2021 14:56:05 -0400 Subject: [PATCH 069/180] Added release notes regarding 6.3 to 6.2 downgrades --- .../sphinx/source/release-notes/release-notes-620.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/documentation/sphinx/source/release-notes/release-notes-620.rst b/documentation/sphinx/source/release-notes/release-notes-620.rst index 3148eefa97..1150b29c38 100644 --- a/documentation/sphinx/source/release-notes/release-notes-620.rst +++ b/documentation/sphinx/source/release-notes/release-notes-620.rst @@ -8,6 +8,11 @@ Release Notes * Fix backup agent stall when writing to local filesystem with slow metadata operations. `(PR #4428) `_ * Backup agent no longer uses 4k block caching layer on local output files so that write operations are larger. `(PR #4428) `_ * Fix accounting error that could cause commits to incorrectly fail with ``proxy_memory_limit_exceeded``. `(PR #4529) `_ +* Added support for downgrades from FDB version 6.3. `(PR #4673) `_ +* Restrictions added for 6.3 clusters to maintain compatibility with a 6.2 downgrade. `(PR #4469) `_ + * Downgrades from 6.3 cannot have ``TLogVersion`` greater than V4 (6.2). + * Downgrades from 6.3 cannot use storage engine types that are not ``ssd-1``, ``ssd-2``, or ``memory``. + * Downgrades from 6.3 must not have any key servers serialized with tag encoding. ``TAG_ENCODE_KEY_SERVERS`` must not be set to true at any point in time. 6.2.32 ====== From d79dc447b4fd45907e72ae8913b021e60ac76458 Mon Sep 17 00:00:00 2001 From: "A.J. 
Beamon" Date: Fri, 16 Apr 2021 15:24:21 -0700 Subject: [PATCH 070/180] Update release notes --- .../sphinx/source/release-notes/release-notes-630.rst | 4 ++++ .../sphinx/source/release-notes/release-notes-700.rst | 3 ++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/documentation/sphinx/source/release-notes/release-notes-630.rst b/documentation/sphinx/source/release-notes/release-notes-630.rst index 076f85d74d..cd8c5e4150 100644 --- a/documentation/sphinx/source/release-notes/release-notes-630.rst +++ b/documentation/sphinx/source/release-notes/release-notes-630.rst @@ -2,6 +2,10 @@ Release Notes ############# +6.3.13 +====== +* The multi-version client now requires at most two connections to the cluster, regardless of how many external clients are configured. `(PR #4667) `_ + 6.3.12 ====== * Change the default for --knob_tls_server_handshake_threads to 64. The previous was 1000. This avoids starting 1000 threads by default, but may adversely affect recovery time for large clusters using tls. Users with large tls clusters should consider explicitly setting this knob in their foundationdb.conf file. `(PR #4421) `_ diff --git a/documentation/sphinx/source/release-notes/release-notes-700.rst b/documentation/sphinx/source/release-notes/release-notes-700.rst index 431ea14fc2..5f3d3a4669 100644 --- a/documentation/sphinx/source/release-notes/release-notes-700.rst +++ b/documentation/sphinx/source/release-notes/release-notes-700.rst @@ -15,7 +15,8 @@ Features Performance ----------- -* Increased performance of dr_agent when copying the mutation log. The ``COPY_LOG_BLOCK_SIZE``, ``COPY_LOG_BLOCKS_PER_TASK``, ``COPY_LOG_PREFETCH_BLOCKS``, ``COPY_LOG_READ_AHEAD_BYTES`` and ``COPY_LOG_TASK_DURATION_NANOS`` knobs can be set. `(PR 3436) `_ +* Increased performance of dr_agent when copying the mutation log. 
The ``COPY_LOG_BLOCK_SIZE``, ``COPY_LOG_BLOCKS_PER_TASK``, ``COPY_LOG_PREFETCH_BLOCKS``, ``COPY_LOG_READ_AHEAD_BYTES`` and ``COPY_LOG_TASK_DURATION_NANOS`` knobs can be set. `(PR #3436) `_ +* Reduced the number of connections required by the multi-version client when loading external clients. When connection to 7.0 clusters, only one connection will be used. With older clusters, at most two connections will be used. `(PR #4667) `_ Reliability ----------- From 336a429be106c8c88da24a8856d59736fbdb4773 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Fri, 16 Apr 2021 17:32:53 -0600 Subject: [PATCH 071/180] first version of profiler --- fdbclient/ActorLineageProfiler.cpp | 94 +++++++++++++++++++++++++++--- fdbclient/ActorLineageProfiler.h | 69 ++++++++++++++++++---- fdbclient/AnnotateActor.h | 20 ++++++- 3 files changed, 161 insertions(+), 22 deletions(-) diff --git a/fdbclient/ActorLineageProfiler.cpp b/fdbclient/ActorLineageProfiler.cpp index 5c0aaf86d1..4993a74207 100644 --- a/fdbclient/ActorLineageProfiler.cpp +++ b/fdbclient/ActorLineageProfiler.cpp @@ -18,7 +18,9 @@ * limitations under the License. */ +#include "flow/flow.h" #include "flow/singleton.h" +#include "fdbrpc/IAsyncFile.h" #include "fdbclient/ActorLineageProfiler.h" #include #include @@ -26,15 +28,6 @@ using namespace std::literals; -std::string_view to_string(WaitState w) { - switch (w) { - case WaitState::Running: - return "Running"; - case WaitState::DiskIO: - return "DiskIO"; - } -} - class Packer : public msgpack::packer { struct visitor_t { using VisitorMap = std::unordered_map>; @@ -201,3 +194,86 @@ std::shared_ptr SampleCollectorT::collect() { packer.pack(res); return packer.done(time); } + +void SampleCollection_t::refresh() { + auto sample = _collector->collect(); + auto min = sample->time - windowSize; + double oldest = 0.0; + while (oldest < min && !data.empty()) { + // we remove at most 10 elements at a time. This is so we don't block the main thread for too long. 
+ { + Lock _{ mutex }; + int i = 0; + do { + oldest = data.front()->time; + data.pop_front(); + ++i; + } while (i < 10 && oldest < min && !data.empty()); + } + } + { + Lock _{ mutex }; + data.push_back(sample); + } +} + +std::vector> SampleCollection_t::get(double from /*= 0.0*/, + double to /*= std::numeric_limits::max()*/) const { + Lock _{ mutex }; + std::vector> res; + for (const auto& sample : data) { + if (sample->time > to) { + break; + } else if (sample->time > from) { + res.emplace_back(sample); + } + } + return res; +} + +ActorLineageProfilerT::ActorLineageProfilerT() { + collection->collector()->addGetter(WaitState::Network, + std::bind(&ActorLineageSet::copy, std::ref(g_network->getActorLineageSet()))); + collection->collector()->addGetter( + WaitState::Disk, + std::bind(&ActorLineageSet::copy, std::ref(IAsyncFileSystem::filesystem()->getActorLineageSet()))); + collection->collector()->addGetter(WaitState::Running, []() { + auto res = currentLineageThreadSafe.get(); + return std::vector>({ currentLineageThreadSafe.get() }); + }); +} + +ActorLineageProfilerT::~ActorLineageProfilerT() { + stop(); +} + +void ActorLineageProfilerT::stop() { + setFrequency(0); +} + +void ActorLineageProfilerT::setFrequency(unsigned frequency) { + bool change = this->frequency != frequency; + this->frequency = frequency; + if (frequency != 0 && !profilerThread.joinable()) { + profilerThread = std::thread(std::bind(&ActorLineageProfilerT::profile, this)); + } else if (change) { + cond.notify_all(); + } +} + +void ActorLineageProfilerT::profile() { + for (;;) { + collection->refresh(); + if (frequency == 0) { + return; + } + { + std::unique_lock lock{ mutex }; + cond.wait_for(lock, std::chrono::microseconds(1000000 / frequency)); + // cond.wait_until(lock, lastSample + std::chrono::milliseconds) + } + if (frequency == 0) { + return; + } + } +} diff --git a/fdbclient/ActorLineageProfiler.h b/fdbclient/ActorLineageProfiler.h index 1f2bdad659..3f11840714 100644 --- 
a/fdbclient/ActorLineageProfiler.h +++ b/fdbclient/ActorLineageProfiler.h @@ -19,13 +19,19 @@ */ #pragma once +#include "fdbclient/AnnotateActor.h" + #include #include #include #include +#include +#include #include "flow/singleton.h" #include "flow/flow.h" +void runSamplingProfiler(); + struct IALPCollectorBase { virtual std::optional collect(ActorLineage*) = 0; virtual const std::string_view& name() = 0; @@ -34,19 +40,9 @@ struct IALPCollectorBase { template struct IALPCollector : IALPCollectorBase { - const std::string_view& name() override { - static std::string_view res; - if (res == "") { - res = T::name; - } - return res; - } + const std::string_view& name() override { return T::name; } }; -enum class WaitState { Running, DiskIO }; - -std::string_view to_string(WaitState w); - struct Sample : std::enable_shared_from_this { double time = 0.0; unsigned size = 0u; @@ -68,6 +64,57 @@ private: public: void addCollector(IALPCollectorBase* collector) { collectors.push_back(collector); } std::shared_ptr collect(); + void addGetter(WaitState waitState, Getter const& getter); }; using SampleCollector = crossbow::singleton; + +class SampleCollection_t { + friend struct crossbow::create_static; + using Lock = std::unique_lock; + SampleCollection_t() {} + + SampleCollector _collector; + mutable std::mutex mutex; + std::atomic windowSize = 0.0; + std::deque> data; + +public: + /** + * Define how many samples the collection shoul keep. The window size is defined by time dimension. + * + * \param duration How long a sample should be kept in the collection. + */ + void setWindowSize(double duration) { windowSize.store(duration); } + /** + * By default returns reference counted pointers of all samples. A window can be defined in terms of absolute time. + * + * \param from The minimal age of all returned samples. + * \param to The max age of all returned samples. 
+ */ + std::vector> get(double from = 0.0, double to = std::numeric_limits::max()) const; + /** + * Collects all new samples from the sample collector and stores them in the collection. + */ + void refresh(); + const SampleCollector& collector() const { return _collector; } + SampleCollector& collector() { return _collector; } +}; + +using SampleCollection = crossbow::singleton; + +class ActorLineageProfilerT { + friend struct crossbow::create_static; + ActorLineageProfilerT(); + SampleCollection collection; + std::thread profilerThread; + std::atomic frequency = 0; + std::mutex mutex; + std::condition_variable cond; + void profile(); + +public: + ~ActorLineageProfilerT(); + void setFrequency(unsigned frequency); + void stop(); +}; diff --git a/fdbclient/AnnotateActor.h b/fdbclient/AnnotateActor.h index 265d1bb3ad..660b777d69 100644 --- a/fdbclient/AnnotateActor.h +++ b/fdbclient/AnnotateActor.h @@ -23,6 +23,8 @@ #include "flow/flow.h" #include "flow/network.h" +#include + // Used to manually instrument waiting actors to collect samples for the // sampling profiler. 
struct AnnotateActor { @@ -51,7 +53,7 @@ struct AnnotateActor { return *this; } - + ~AnnotateActor() { if (set) { g_network->getActorLineageSet().erase(index); @@ -59,6 +61,20 @@ struct AnnotateActor { } }; -enum WaitState { Disk, Network }; +enum class WaitState { Disk, Network, Running }; +// usually we shouldn't use `using namespace` in a header file, but literals should be safe as user defined literals +// need to be prefixed with `_` +using namespace std::literals; + +constexpr std::string_view to_string(WaitState st) { + switch (st) { + case WaitState::Disk: + return "Disk"sv; + case WaitState::Network: + return "Network"sv; + case WaitState::Running: + return "Running"sv; + } +} extern std::map>()>> samples; From 5c3cb0da205f02a407d471cf23ba6c492135f41f Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Fri, 16 Apr 2021 20:13:23 -0700 Subject: [PATCH 072/180] Pager now reports whether an uncacheable read hit the cache or not. CommitSubtree now does uncacheable reads so it can avoid copying the old version of the page for modification if it was not already in cache. --- fdbserver/IPager.h | 10 +++- fdbserver/VersionedBTree.actor.cpp | 76 +++++++++++++++++++++--------- 2 files changed, 63 insertions(+), 23 deletions(-) diff --git a/fdbserver/IPager.h b/fdbserver/IPager.h index 7f21e30566..4eddfbc5e8 100644 --- a/fdbserver/IPager.h +++ b/fdbserver/IPager.h @@ -65,7 +65,10 @@ public: class IPagerSnapshot { public: - virtual Future> getPhysicalPage(LogicalPageID pageID, bool cacheable, bool nohit) = 0; + virtual Future> getPhysicalPage(LogicalPageID pageID, + bool cacheable, + bool nohit, + bool* fromCache = nullptr) = 0; virtual bool tryEvictPage(LogicalPageID id) = 0; virtual Version getVersion() const = 0; @@ -117,7 +120,10 @@ public: // Cacheable indicates that the page should be added to the page cache (if applicable?) as a result of this read. 
// NoHit indicates that the read should not be considered a cache hit, such as when preloading pages that are // considered likely to be needed soon. - virtual Future> readPage(LogicalPageID pageID, bool cacheable = true, bool noHit = false) = 0; + virtual Future> readPage(LogicalPageID pageID, + bool cacheable = true, + bool noHit = false, + bool* fromCache = nullptr) = 0; // Get a snapshot of the metakey and all pages as of the version v which must be >= getOldestVersion() // Note that snapshots at any version may still see the results of updatePage() calls. diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 52408c2820..77d602d0cc 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -1711,14 +1711,24 @@ public: return pageCache.tryEvict(physicalID); } - // Reads the most recent version of pageID, either previously committed or written using updatePage() in the current - // commit - Future> readPage(LogicalPageID pageID, bool cacheable, bool noHit = false) override { + // Reads the most recent version of pageID, either previously committed or written using updatePage() + // in the current commit + // If cacheable is false then if fromCache is valid it will be set to true if the page is from cache, otherwise + // false. If cacheable is true, fromCache is ignored as the result is automatically from cache by virtue of being + // cacheable. + Future> readPage(LogicalPageID pageID, + bool cacheable, + bool noHit = false, + bool* fromCache = nullptr) override { // Use cached page if present, without triggering a cache hit. 
// Otherwise, read the page and return it but don't add it to the cache if (!cacheable) { debug_printf("DWALPager(%s) op=readUncached %s\n", filename.c_str(), toString(pageID).c_str()); PageCacheEntry* pCacheEntry = pageCache.getIfExists(pageID); + if (fromCache != nullptr) { + *fromCache = pCacheEntry != nullptr; + } + if (pCacheEntry != nullptr) { debug_printf("DWALPager(%s) op=readUncachedHit %s\n", filename.c_str(), toString(pageID).c_str()); return pCacheEntry->readFuture; @@ -1771,9 +1781,13 @@ public: return (PhysicalPageID)pageID; } - Future> readPageAtVersion(LogicalPageID logicalID, Version v, bool cacheable, bool noHit) { + Future> readPageAtVersion(LogicalPageID logicalID, + Version v, + bool cacheable, + bool noHit, + bool* fromCache) { PhysicalPageID physicalID = getPhysicalPageID(logicalID, v); - return readPage(physicalID, cacheable, noHit); + return readPage(physicalID, cacheable, noHit, fromCache); } // Get snapshot as of the most recent committed version of the pager @@ -2302,11 +2316,14 @@ public: : pager(pager), metaKey(meta), version(version), expired(expiredFuture) {} ~DWALPagerSnapshot() override {} - Future> getPhysicalPage(LogicalPageID pageID, bool cacheable, bool noHit) override { + Future> getPhysicalPage(LogicalPageID pageID, + bool cacheable, + bool noHit, + bool* fromCache) override { if (expired.isError()) { throw expired.getError(); } - return map(pager->readPageAtVersion(pageID, version, cacheable, noHit), + return map(pager->readPageAtVersion(pageID, version, cacheable, noHit, fromCache), [=](Reference p) { return Reference(std::move(p)); }); } @@ -3324,7 +3341,7 @@ public: } // Start reading the page, without caching entries.push_back( - std::make_pair(q.get(), self->readPage(snapshot, q.get().pageID, nullptr, nullptr, true))); + std::make_pair(q.get(), self->readPage(snapshot, q.get().pageID, nullptr, nullptr, true, false))); --toPop; } @@ -4196,7 +4213,11 @@ private: BTreePageIDRef id, const RedwoodRecordRef* lowerBound, const 
RedwoodRecordRef* upperBound, - bool forLazyClear = false) { + bool forLazyClear = false, + bool cacheable = true, + bool* fromCache = nullptr) + + { if (!forLazyClear) { debug_printf("readPage() op=read %s @%" PRId64 " lower=%s upper=%s\n", toString(id).c_str(), @@ -4213,17 +4234,22 @@ private: state Reference page; if (id.size() == 1) { - Reference p = wait(snapshot->getPhysicalPage(id.front(), !forLazyClear, false)); + Reference p = wait(snapshot->getPhysicalPage(id.front(), cacheable, false, fromCache)); page = std::move(p); } else { ASSERT(!id.empty()); std::vector>> reads; for (auto& pageID : id) { - reads.push_back(snapshot->getPhysicalPage(pageID, !forLazyClear, false)); + reads.push_back(snapshot->getPhysicalPage(pageID, cacheable, false)); } std::vector> pages = wait(getAll(reads)); // TODO: Cache reconstituted super pages somehow, perhaps with help from the Pager. page = Reference(new SuperPage(pages)); + + // In the current implementation, SuperPages are never present in the cache + if (fromCache != nullptr) { + *fromCache = false; + } } debug_printf("readPage() op=readComplete %s @%" PRId64 " \n", toString(id).c_str(), snapshot->getVersion()); @@ -4233,7 +4259,7 @@ private: metrics.pageReadExt += (id.size() - 1); if (!forLazyClear && page->userData == nullptr) { - debug_printf("readPage() Creating Reader for %s @%" PRId64 " lower=%s upper=%s\n", + debug_printf("readPage() Creating Mirror for %s @%" PRId64 " lower=%s upper=%s\n", toString(id).c_str(), snapshot->getVersion(), lowerBound->toString(false).c_str(), @@ -4618,8 +4644,9 @@ private: state Reference commitReadLock = self->m_commitReadLock; wait(commitReadLock->take()); state FlowLock::Releaser readLock(*commitReadLock); - state Reference page = - wait(readPage(snapshot, rootID, update->decodeLowerBound, update->decodeUpperBound)); + state bool fromCache = false; + state Reference page = wait( + readPage(snapshot, rootID, update->decodeLowerBound, update->decodeUpperBound, false, false, 
&fromCache)); readLock.release(); state BTreePage* btPage = (BTreePage*)page->begin(); @@ -4631,11 +4658,13 @@ private: // though it is awkward to reason about. state bool tryToUpdate = btPage->tree().numItems > 0 && update->boundariesNormal(); - // If trying to update the page, we need to clone it so we don't modify the original. + // If trying to update the page and the page reference points into the cache, + // we need to clone it so we don't modify the original version of the page. // TODO: Refactor DeltaTree::Mirror so it can be shared between different versions of pages - if (tryToUpdate) { + if (tryToUpdate && fromCache) { page = self->cloneForUpdate(page); btPage = (BTreePage*)page->begin(); + fromCache = false; } debug_printf( @@ -5124,12 +5153,17 @@ private: parentInfo->count); forceUpdate = true; if (!m.updating) { - page = self->cloneForUpdate(page); - cursor = getCursor(page); - btPage = (BTreePage*)page->begin(); - m.btPage = btPage; - m.m = cursor.mirror; m.updating = true; + + // Copy the page before modification if the page references the cache + if (fromCache) { + page = self->cloneForUpdate(page); + cursor = getCursor(page); + btPage = (BTreePage*)page->begin(); + m.btPage = btPage; + m.m = cursor.mirror; + fromCache = false; + } } ++g_redwoodMetrics.level(btPage->height).forceUpdate; } From 09ddcb3bae9a83818aa64f06ed79ebb3aca566ad Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Mon, 19 Apr 2021 11:27:19 -0600 Subject: [PATCH 073/180] remove old sample thread --- fdbclient/NativeAPI.actor.cpp | 3 ++- fdbserver/fdbserver.actor.cpp | 1 - flow/Platform.actor.cpp | 28 ---------------------------- flow/Platform.h | 8 +++----- 4 files changed, 5 insertions(+), 35 deletions(-) diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index b761f6c049..7ab3f18440 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -1801,7 +1801,6 @@ void runNetwork() { if (networkOptions.traceDirectory.present() && 
networkOptions.runLoopProfilingEnabled) { setupRunLoopProfiler(); } - setupSamplingProfiler(); g_network->run(); @@ -2483,9 +2482,11 @@ ACTOR Future watchValue(Future version, cx->invalidateCache(key); wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, info.taskID)); } else if (e.code() == error_code_watch_cancelled || e.code() == error_code_process_behind) { + // clang-format off TEST(e.code() == error_code_watch_cancelled); // Too many watches on the storage server, poll for changes instead TEST(e.code() == error_code_watch_cancelled); // Too many watches on storage server, poll for changes TEST(e.code() == error_code_process_behind); // The storage servers are all behind + // clang-format on wait(delay(CLIENT_KNOBS->WATCH_POLLING_TIME, info.taskID)); } else if (e.code() == error_code_timed_out) { // The storage server occasionally times out watches in case // it was cancelled diff --git a/fdbserver/fdbserver.actor.cpp b/fdbserver/fdbserver.actor.cpp index ce2a903c1f..ac1bf7950f 100644 --- a/fdbserver/fdbserver.actor.cpp +++ b/fdbserver/fdbserver.actor.cpp @@ -1948,7 +1948,6 @@ int main(int argc, char* argv[]) { ASSERT(opts.connectionFile); setupRunLoopProfiler(); - setupSamplingProfiler(); auto dataFolder = opts.dataFolder; if (!dataFolder.size()) diff --git a/flow/Platform.actor.cpp b/flow/Platform.actor.cpp index 4d435afe00..8cdb34f769 100644 --- a/flow/Platform.actor.cpp +++ b/flow/Platform.actor.cpp @@ -3677,34 +3677,6 @@ void setupRunLoopProfiler() { #endif } -void* sampleThread(void* arg) { - while (true) { - threadSleep(1.0); // TODO: Read sample rate from global config - - // Get actor lineage of currently running actor. 
- auto actorLineage = currentLineageThreadSafe.get(); - // TODO: Use actorLineage - - for (const auto& [waitState, lineageFn] : samples) { - auto alps = lineageFn(); - - // TODO: Serialize collected actor linage properties - } - } - - return nullptr; -} - -void setupSamplingProfiler() { - samples[WaitState::Disk] = std::bind(&ActorLineageSet::copy, std::ref(g_network->getActorLineageSet())); - samples[WaitState::Network] = - std::bind(&ActorLineageSet::copy, std::ref(IAsyncFileSystem::filesystem()->getActorLineageSet())); - - // TODO: Add knob - TraceEvent("StartingSamplingProfilerThread"); - startThread(&sampleThread, nullptr); -} - // UnitTest for getMemoryInfo #ifdef __linux__ TEST_CASE("/flow/Platform/getMemoryInfo") { diff --git a/flow/Platform.h b/flow/Platform.h index edf9ff3997..c50c13e11a 100644 --- a/flow/Platform.h +++ b/flow/Platform.h @@ -741,8 +741,6 @@ void registerCrashHandler(); void setupRunLoopProfiler(); EXTERNC void setProfilingEnabled(int enabled); -void setupSamplingProfiler(); - // Use _exit() or criticalError(), not exit() #define exit static_assert(false, "Calls to exit() are forbidden by policy"); @@ -793,17 +791,17 @@ inline void fdb_probe_actor_exit(const char* name, unsigned long id, int index) #include static inline uint32_t hwCrc32cU8(unsigned int crc, unsigned char v) { uint32_t ret; - asm volatile("crc32cb %w[r], %w[c], %w[v]" : [r] "=r"(ret) : [c] "r"(crc), [v] "r"(v)); + asm volatile("crc32cb %w[r], %w[c], %w[v]" : [ r ] "=r"(ret) : [ c ] "r"(crc), [ v ] "r"(v)); return ret; } static inline uint32_t hwCrc32cU32(unsigned int crc, unsigned int v) { uint32_t ret; - asm volatile("crc32cw %w[r], %w[c], %w[v]" : [r] "=r"(ret) : [c] "r"(crc), [v] "r"(v)); + asm volatile("crc32cw %w[r], %w[c], %w[v]" : [ r ] "=r"(ret) : [ c ] "r"(crc), [ v ] "r"(v)); return ret; } static inline uint64_t hwCrc32cU64(uint64_t crc, uint64_t v) { uint64_t ret; - asm volatile("crc32cx %w[r], %w[c], %x[v]" : [r] "=r"(ret) : [c] "r"(crc), [v] "r"(v)); + asm 
volatile("crc32cx %w[r], %w[c], %x[v]" : [ r ] "=r"(ret) : [ c ] "r"(crc), [ v ] "r"(v)); return ret; } #else From f8d2bca6a4b0a664373d4ce511ee062c65b9cc9e Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Mon, 19 Apr 2021 13:10:27 -0600 Subject: [PATCH 074/180] address review comments --- fdbclient/ActorLineageProfiler.cpp | 32 ++++++++++++++---------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/fdbclient/ActorLineageProfiler.cpp b/fdbclient/ActorLineageProfiler.cpp index 4993a74207..82d04aa42c 100644 --- a/fdbclient/ActorLineageProfiler.cpp +++ b/fdbclient/ActorLineageProfiler.cpp @@ -197,23 +197,21 @@ std::shared_ptr SampleCollectorT::collect() { void SampleCollection_t::refresh() { auto sample = _collector->collect(); - auto min = sample->time - windowSize; - double oldest = 0.0; - while (oldest < min && !data.empty()) { - // we remove at most 10 elements at a time. This is so we don't block the main thread for too long. - { - Lock _{ mutex }; - int i = 0; - do { - oldest = data.front()->time; - data.pop_front(); - ++i; - } while (i < 10 && oldest < min && !data.empty()); - } - } + auto min = std::max(sample->time - windowSize, sample->time); { Lock _{ mutex }; - data.push_back(sample); + data.emplace_back(std::move(sample)); + } + double oldest = data.front()->time; + // we don't need to check for data.empty() in this loop (or the inner loop) as we know that we will end + // up with at least one entry which is the most recent sample + while (oldest < min) { + Lock _{ mutex }; + // we remove at most 10 elements at a time. This is so we don't block the main thread for too long. 
+ for (int i = 0; i < 10 && oldest < min; ++i) { + data.pop_front(); + oldest = data.front()->time; + } } } @@ -224,8 +222,8 @@ std::vector> SampleCollection_t::get(double from /*= 0.0 for (const auto& sample : data) { if (sample->time > to) { break; - } else if (sample->time > from) { - res.emplace_back(sample); + } else if (sample->time >= from) { + res.push_back(sample); } } return res; From 36702e57ee3dd61b942705cccd6ae5a6ae20d7fd Mon Sep 17 00:00:00 2001 From: RenxuanW Date: Mon, 19 Apr 2021 17:06:10 -0700 Subject: [PATCH 075/180] Rename a few variables. --- fdbserver/Knobs.cpp | 2 +- fdbserver/Knobs.h | 2 +- fdbserver/worker.actor.cpp | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fdbserver/Knobs.cpp b/fdbserver/Knobs.cpp index df3434fb9b..ef2334d3cf 100644 --- a/fdbserver/Knobs.cpp +++ b/fdbserver/Knobs.cpp @@ -616,7 +616,7 @@ void ServerKnobs::initialize(bool randomize, ClientKnobs* clientKnobs, bool isSi //Worker init( WORKER_LOGGING_INTERVAL, 5.0 ); init( HEAP_PROFILER_INTERVAL, 30.0 ); - init( JOIN_CLUSTER_WARNING_INTERVAL, 300.0 ); + init( REGISTER_WORKER_REQUEST_TIMEOUT, 300.0 ); init( DEGRADED_RESET_INTERVAL, 24*60*60 ); if ( randomize && BUGGIFY ) DEGRADED_RESET_INTERVAL = 10; init( DEGRADED_WARNING_LIMIT, 1 ); init( DEGRADED_WARNING_RESET_DELAY, 7*24*60*60 ); diff --git a/fdbserver/Knobs.h b/fdbserver/Knobs.h index 690bbe6327..a9333b0cf3 100644 --- a/fdbserver/Knobs.h +++ b/fdbserver/Knobs.h @@ -543,7 +543,7 @@ public: // Worker double WORKER_LOGGING_INTERVAL; double HEAP_PROFILER_INTERVAL; - double JOIN_CLUSTER_WARNING_INTERVAL; + double REGISTER_WORKER_REQUEST_TIMEOUT; double DEGRADED_RESET_INTERVAL; double DEGRADED_WARNING_LIMIT; double DEGRADED_WARNING_RESET_DELAY; diff --git a/fdbserver/worker.actor.cpp b/fdbserver/worker.actor.cpp index e03089b618..c2f35f226c 100644 --- a/fdbserver/worker.actor.cpp +++ b/fdbserver/worker.actor.cpp @@ -569,11 +569,11 @@ ACTOR Future registrationClient(Referenceset(reply.priorityInfo); - 
TraceEvent("WorkerJoiningCluster").detail("CCID", ccInterface->get().get().id()); + TraceEvent("WorkerRegisterReply").detail("CCID", ccInterface->get().get().id()); break; } when(wait(delay(SERVER_KNOBS->JOIN_CLUSTER_WARNING_INTERVAL))) { - TraceEvent(SevWarn, "WorkerNotJoinedClusterForLongTime").detail("WaitTime", now() - startTime); + TraceEvent(SevWarn, "WorkerRegisterTimeout").detail("WaitTime", now() - startTime); } when(wait(ccInterface->onChange())) { break; } when(wait(ddInterf->onChange())) { break; } From 4a35fa07e784816843d3053dadc4a53c42a03912 Mon Sep 17 00:00:00 2001 From: RenxuanW Date: Mon, 19 Apr 2021 17:14:46 -0700 Subject: [PATCH 076/180] Add a safe check --- fdbserver/worker.actor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbserver/worker.actor.cpp b/fdbserver/worker.actor.cpp index c2f35f226c..0eb77c2553 100644 --- a/fdbserver/worker.actor.cpp +++ b/fdbserver/worker.actor.cpp @@ -542,7 +542,7 @@ ACTOR Future registrationClient(Reference(); - if (connFile->canGetFilename()) { + if (connFile && connFile->canGetFilename()) { TraceEvent("ClusterFileContents") .detail("Filename", connFile->getFilename()) .detail("ConnectionStringFromFile", fileConnectionString.toString()) From f04303185fa81f9900ea38f99a03452d2059c886 Mon Sep 17 00:00:00 2001 From: RenxuanW Date: Mon, 19 Apr 2021 17:17:22 -0700 Subject: [PATCH 077/180] Huh --- fdbserver/worker.actor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbserver/worker.actor.cpp b/fdbserver/worker.actor.cpp index 0eb77c2553..b7cbfe16d7 100644 --- a/fdbserver/worker.actor.cpp +++ b/fdbserver/worker.actor.cpp @@ -572,7 +572,7 @@ ACTOR Future registrationClient(Referenceget().get().id()); break; } - when(wait(delay(SERVER_KNOBS->JOIN_CLUSTER_WARNING_INTERVAL))) { + when(wait(delay(SERVER_KNOBS->REGISTER_WORKER_REQUEST_TIMEOUT))) { TraceEvent(SevWarn, "WorkerRegisterTimeout").detail("WaitTime", now() - startTime); } when(wait(ccInterface->onChange())) { break; } 
From f8054b82de8c61a05c7cd8b0c45d9eca0981033d Mon Sep 17 00:00:00 2001 From: Cynthia Date: Mon, 19 Apr 2021 22:24:13 -0600 Subject: [PATCH 078/180] fdbcli prints error on TLS File not found --- flow/TLSConfig.actor.cpp | 50 ++++++++++++++++++++++++++++++++++------ 1 file changed, 43 insertions(+), 7 deletions(-) diff --git a/flow/TLSConfig.actor.cpp b/flow/TLSConfig.actor.cpp index d716ccf19a..867c2369e1 100644 --- a/flow/TLSConfig.actor.cpp +++ b/flow/TLSConfig.actor.cpp @@ -253,21 +253,36 @@ LoadedTLSConfig TLSConfig::loadSync() const { const std::string certPath = getCertificatePathSync(); if (certPath.size()) { - loaded.tlsCertBytes = readFileBytes(certPath, FLOW_KNOBS->CERT_FILE_MAX_SIZE); + try { + loaded.tlsCertBytes = readFileBytes(certPath, FLOW_KNOBS->CERT_FILE_MAX_SIZE); + } catch (Error& e) { + fprintf(stderr, "Error reading TLS Certificate [%s]: %s\n", certPath.c_str(), e.what()); + throw; + } } else { loaded.tlsCertBytes = tlsCertBytes; } const std::string keyPath = getKeyPathSync(); if (keyPath.size()) { - loaded.tlsKeyBytes = readFileBytes(keyPath, FLOW_KNOBS->CERT_FILE_MAX_SIZE); + try { + loaded.tlsKeyBytes = readFileBytes(keyPath, FLOW_KNOBS->CERT_FILE_MAX_SIZE); + } catch (Error& e) { + fprintf(stderr, "Error reading TLS Key [%s]: %s\n", keyPath.c_str(), e.what()); + throw; + } } else { loaded.tlsKeyBytes = tlsKeyBytes; } const std::string CAPath = getCAPathSync(); if (CAPath.size()) { - loaded.tlsCABytes = readFileBytes(CAPath, FLOW_KNOBS->CERT_FILE_MAX_SIZE); + try { + loaded.tlsCABytes = readFileBytes(CAPath, FLOW_KNOBS->CERT_FILE_MAX_SIZE); + } catch (Error& e) { + fprintf(stderr, "Error reading TLS CA [%s]: %s\n", CAPath.c_str(), e.what()); + throw; + } } else { loaded.tlsCABytes = tlsCABytes; } @@ -297,28 +312,49 @@ ACTOR Future TLSConfig::loadAsync(const TLSConfig* self) { state LoadedTLSConfig loaded; state std::vector> reads; - const std::string& certPath = self->getCertificatePathSync(); + state int32_t certIdx = -1; + state int32_t 
keyIdx = -1; + state int32_t caIdx = -1; + + state std::string certPath = self->getCertificatePathSync(); if (certPath.size()) { reads.push_back(readEntireFile(certPath, &loaded.tlsCertBytes)); + certIdx = reads.size() - 1; } else { loaded.tlsCertBytes = self->tlsCertBytes; } - const std::string& keyPath = self->getKeyPathSync(); + state std::string keyPath = self->getKeyPathSync(); if (keyPath.size()) { reads.push_back(readEntireFile(keyPath, &loaded.tlsKeyBytes)); + keyIdx = reads.size() - 1; } else { loaded.tlsKeyBytes = self->tlsKeyBytes; } - const std::string& CAPath = self->getCAPathSync(); + state std::string CAPath = self->getCAPathSync(); if (CAPath.size()) { reads.push_back(readEntireFile(CAPath, &loaded.tlsCABytes)); + caIdx = reads.size() - 1; } else { loaded.tlsCABytes = self->tlsCABytes; } - wait(waitForAll(reads)); + try { + wait(waitForAll(reads)); + } catch (Error& e) { + if (certIdx != -1 && reads[certIdx].isError()) { + fprintf(stderr, "Failure reading TLS Certificate [%s]: %s\n", certPath.c_str(), e.what()); + } else if (keyIdx != -1 && reads[keyIdx].isError()) { + fprintf(stderr, "Failure reading TLS Key [%s]: %s\n", keyPath.c_str(), e.what()); + } else if (caIdx != -1 && reads[caIdx].isError()) { + fprintf(stderr, "Failure reading TLS Key [%s]: %s\n", CAPath.c_str(), e.what()); + } else { + fprintf(stderr, "Failure reading TLS needed file: %s\n", e.what()); + } + + throw; + } loaded.tlsPassword = self->tlsPassword; loaded.tlsVerifyPeers = self->tlsVerifyPeers; From c81e1e95193ee07437984ae1d3ab8cc7f0bd957b Mon Sep 17 00:00:00 2001 From: Lukas Joswiak Date: Mon, 19 Apr 2021 22:46:57 -0700 Subject: [PATCH 079/180] Add sampling profiler frequency to global config --- fdbclient/ActorLineageProfiler.h | 4 +++- fdbclient/GlobalConfig.actor.cpp | 12 ++++++++++++ fdbclient/GlobalConfig.actor.h | 15 +++++++++++++++ fdbserver/ClusterController.actor.cpp | 4 +++- fdbserver/fdbserver.actor.cpp | 24 ++++++++++++++++++++++++ 5 files changed, 57 
insertions(+), 2 deletions(-) diff --git a/fdbclient/ActorLineageProfiler.h b/fdbclient/ActorLineageProfiler.h index 3f11840714..5dee2a4291 100644 --- a/fdbclient/ActorLineageProfiler.h +++ b/fdbclient/ActorLineageProfiler.h @@ -64,7 +64,7 @@ private: public: void addCollector(IALPCollectorBase* collector) { collectors.push_back(collector); } std::shared_ptr collect(); - void addGetter(WaitState waitState, Getter const& getter); + void addGetter(WaitState waitState, Getter const& getter) { getSamples[waitState] = getter; }; }; using SampleCollector = crossbow::singleton; @@ -118,3 +118,5 @@ public: void setFrequency(unsigned frequency); void stop(); }; + +using ActorLineageProfiler = crossbow::singleton; diff --git a/fdbclient/GlobalConfig.actor.cpp b/fdbclient/GlobalConfig.actor.cpp index 58e032d363..95d7cfce13 100644 --- a/fdbclient/GlobalConfig.actor.cpp +++ b/fdbclient/GlobalConfig.actor.cpp @@ -34,6 +34,8 @@ const KeyRef fdbClientInfoTxnSizeLimit = LiteralStringRef("config/fdb_client_inf const KeyRef transactionTagSampleRate = LiteralStringRef("config/transaction_tag_sample_rate"); const KeyRef transactionTagSampleCost = LiteralStringRef("config/transaction_tag_sample_cost"); +const KeyRef sampleFrequency = LiteralStringRef("visibility/sample_frequency"); + GlobalConfig::GlobalConfig() : lastUpdate(0) {} void GlobalConfig::create(DatabaseContext* cx, Reference> dbInfo) { @@ -45,6 +47,10 @@ void GlobalConfig::create(DatabaseContext* cx, Reference> } } +void GlobalConfig::updateDBInfo(Reference> dbInfo) { + _updater = updater(&GlobalConfig::globalConfig(), dbInfo); +} + GlobalConfig& GlobalConfig::globalConfig() { void* res = g_network->global(INetwork::enGlobalConfig); ASSERT(res); @@ -77,6 +83,10 @@ Future GlobalConfig::onInitialized() { return initialized.getFuture(); } +Future GlobalConfig::onChange() { + return configChanged.onTrigger(); +} + void GlobalConfig::insert(KeyRef key, ValueRef value) { data.erase(key); @@ -222,6 +232,8 @@ ACTOR Future 
GlobalConfig::updater(GlobalConfig* self, ReferencelastUpdate = vh.version; } } + + self->configChanged.trigger(); } catch (Error& e) { throw; } diff --git a/fdbclient/GlobalConfig.actor.h b/fdbclient/GlobalConfig.actor.h index 5c3693f450..bf7532a974 100644 --- a/fdbclient/GlobalConfig.actor.h +++ b/fdbclient/GlobalConfig.actor.h @@ -49,6 +49,8 @@ extern const KeyRef fdbClientInfoTxnSizeLimit; extern const KeyRef transactionTagSampleRate; extern const KeyRef transactionTagSampleCost; +extern const KeyRef sampleFrequency; + // Structure used to hold the values stored by global configuration. The arena // is used as memory to store both the key and the value (the value is only // stored in the arena if it is an object; primitives are just copied). @@ -78,6 +80,14 @@ public: // For example, given "config/a", returns "\xff\xff/global_config/config/a". static Key prefixedKey(KeyRef key); + // Update the ClientDBInfo object used internally to check for updates to + // global configuration. The ClientDBInfo reference must be the same one + // used in the cluster controller, but fdbserver requires initial creation + // of the GlobalConfig class before the cluster controller is initialized. + // This function allows the ClientDBInfo object to be updated after create + // was called. + void updateDBInfo(Reference> dbInfo); + // Get a value from the framework. Values are returned as a ConfigValue // reference which also contains the arena holding the object. As long as // the caller keeps the ConfigValue reference, the value is guaranteed to @@ -114,6 +124,10 @@ public: // been created and is ready. Future onInitialized(); + // Triggers the returned future when any key-value pair in the global + // configuration changes. 
+ Future onChange(); + private: GlobalConfig(); @@ -139,6 +153,7 @@ private: Database cx; Future _updater; Promise initialized; + AsyncTrigger configChanged; std::unordered_map> data; Version lastUpdate; }; diff --git a/fdbserver/ClusterController.actor.cpp b/fdbserver/ClusterController.actor.cpp index 8ec3a4d30c..6b929ca29e 100644 --- a/fdbserver/ClusterController.actor.cpp +++ b/fdbserver/ClusterController.actor.cpp @@ -135,7 +135,9 @@ public: true, TaskPriority::DefaultEndpoint, true)) // SOMEDAY: Locality! - {} + { + GlobalConfig::globalConfig().updateDBInfo(clientInfo); + } void setDistributor(const DataDistributorInterface& interf) { auto newInfo = serverInfo->get(); diff --git a/fdbserver/fdbserver.actor.cpp b/fdbserver/fdbserver.actor.cpp index 136cd90c3d..59e2f494fc 100644 --- a/fdbserver/fdbserver.actor.cpp +++ b/fdbserver/fdbserver.actor.cpp @@ -35,6 +35,8 @@ #include #include +#include "fdbclient/ActorLineageProfiler.h" +#include "fdbclient/GlobalConfig.actor.h" #include "fdbclient/NativeAPI.actor.h" #include "fdbclient/RestoreWorkerInterface.actor.h" #include "fdbclient/SystemData.h" @@ -456,6 +458,27 @@ ACTOR Future dumpDatabase(Database cx, std::string outputFilename, KeyRang } } +// Handles running the sampling profiler, including responding to frequency +// changes and other updates the client wishes to make through global +// configuration. 
+ACTOR Future actorLineageProfiler() { + wait(delay(1)); + wait(GlobalConfig::globalConfig().onInitialized()); + // TODO: Add flag to enable/disable + state unsigned frequency = GlobalConfig::globalConfig().get(sampleFrequency, 0); + ActorLineageProfiler::instance().setFrequency(frequency); + + loop { + wait(GlobalConfig::globalConfig().onChange()); + + unsigned latestFrequency = GlobalConfig::globalConfig().get(sampleFrequency, 0); + if (latestFrequency != frequency) { + frequency = latestFrequency; + ActorLineageProfiler::instance().setFrequency(latestFrequency); + } + } +} + void memoryTest(); void skipListTest(); @@ -1987,6 +2010,7 @@ int main(int argc, char* argv[]) { opts.whitelistBinPaths)); actors.push_back(histogramReport()); // actors.push_back( recurring( []{}, .001 ) ); // for ASIO latency measurement + actors.push_back(actorLineageProfiler()); f = stopAfter(waitForAll(actors)); g_network->run(); From 7beccc8643f4a40ea00e7bd64bfd33f7f02ad9a2 Mon Sep 17 00:00:00 2001 From: Jon Fu Date: Tue, 20 Apr 2021 14:13:25 -0400 Subject: [PATCH 080/180] move operational details out of release notes and into administration.rst --- documentation/sphinx/source/administration.rst | 13 +++++++++++++ .../source/release-notes/release-notes-620.rst | 5 +---- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/documentation/sphinx/source/administration.rst b/documentation/sphinx/source/administration.rst index 5f6369d889..bcdeec1566 100644 --- a/documentation/sphinx/source/administration.rst +++ b/documentation/sphinx/source/administration.rst @@ -799,3 +799,16 @@ Upgrading from Older Versions ----------------------------- Upgrades from versions older than 5.0.0 are no longer supported. + +Version-specific notes on downgrading +=================================== + +In general, downgrades between patch releases (i.e. 6.2.x - 6.1.x) are not supported. 
+ +Downgrading from 6.3.13 - 6.2.33 +-------------------------------- +After upgrading from 6.2 to 6.3, the option of rolling back and downgrading to return to 6.2 is still possible, given that the following conditions are met: + +* The 6.3 cluster cannot have ``TLogVersion`` greater than V4 (6.2). +* The 6.3 cluster cannot use storage engine types that are not ``ssd-1``, ``ssd-2``, or ``memory``. +* The 6.3 cluster must not have any key servers serialized with tag encoding. The ``TAG_ENCODE_KEY_SERVERS`` fdbclient knob must not be set to true at any point in time. \ No newline at end of file diff --git a/documentation/sphinx/source/release-notes/release-notes-620.rst b/documentation/sphinx/source/release-notes/release-notes-620.rst index 1150b29c38..b14bbc65fd 100644 --- a/documentation/sphinx/source/release-notes/release-notes-620.rst +++ b/documentation/sphinx/source/release-notes/release-notes-620.rst @@ -9,10 +9,7 @@ Release Notes * Backup agent no longer uses 4k block caching layer on local output files so that write operations are larger. `(PR #4428) `_ * Fix accounting error that could cause commits to incorrectly fail with ``proxy_memory_limit_exceeded``. `(PR #4529) `_ * Added support for downgrades from FDB version 6.3. `(PR #4673) `_ -* Restrictions added for 6.3 clusters to maintain compatibility with a 6.2 downgrade. `(PR #4469) `_ - * Downgrades from 6.3 cannot have ``TLogVersion`` greater than V4 (6.2). - * Downgrades from 6.3 cannot use storage engine types that are not ``ssd-1``, ``ssd-2``, or ``memory``. - * Downgrades from 6.3 must not have any key servers serialized with tag encoding. ``TAG_ENCODE_KEY_SERVERS`` must not be set to true at any point in time. +* Restrictions added for 6.3 clusters to maintain compatibility with a 6.2 downgrade. Details available in ``administration.rst``. 
`(PR #4469) `_ 6.2.32 ====== From d76b32da188c6f51da6d0837551cada78cc6ef53 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Tue, 20 Apr 2021 15:10:01 -0600 Subject: [PATCH 081/180] Annotate read paths on the server side --- fdbclient/ActorLineageProfiler.h | 2 +- fdbclient/NativeAPI.actor.cpp | 3 + fdbclient/TransactionLineage.cpp | 25 +++++ fdbclient/TransactionLineage.h | 128 ++++++++++++++++++++++++++ fdbserver/CommitProxyServer.actor.cpp | 4 + fdbserver/GrvProxyServer.actor.cpp | 5 + fdbserver/storageserver.actor.cpp | 15 ++- fdbserver/worker.actor.cpp | 4 + 8 files changed, 183 insertions(+), 3 deletions(-) create mode 100644 fdbclient/TransactionLineage.cpp create mode 100644 fdbclient/TransactionLineage.h diff --git a/fdbclient/ActorLineageProfiler.h b/fdbclient/ActorLineageProfiler.h index 3f11840714..81d4bcaec7 100644 --- a/fdbclient/ActorLineageProfiler.h +++ b/fdbclient/ActorLineageProfiler.h @@ -64,7 +64,7 @@ private: public: void addCollector(IALPCollectorBase* collector) { collectors.push_back(collector); } std::shared_ptr collect(); - void addGetter(WaitState waitState, Getter const& getter); + void addGetter(WaitState waitState, Getter const& getter) { getSamples.emplace(waitState, getter); } }; using SampleCollector = crossbow::singleton; diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 1857cea0c7..ac45d83b05 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -49,6 +49,7 @@ #include "fdbclient/SpecialKeySpace.actor.h" #include "fdbclient/StorageServerInterface.h" #include "fdbclient/SystemData.h" +#include "fdbclient/TransactionLineage.h" #include "fdbclient/versions.h" #include "fdbrpc/LoadBalance.h" #include "fdbrpc/Net2FileSystem.h" @@ -86,6 +87,8 @@ using std::pair; namespace { +TransactionLineageCollector transactionLineageCollector; + template Future loadBalance( DatabaseContext* ctx, diff --git a/fdbclient/TransactionLineage.cpp b/fdbclient/TransactionLineage.cpp new file mode 
100644 index 0000000000..9ef0f21e1b --- /dev/null +++ b/fdbclient/TransactionLineage.cpp @@ -0,0 +1,25 @@ +/* + * TransactionLineage.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2021 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "fdbclient/TransactionLineage.h" + +namespace { +TransactionLineageCollector transactionLineageCollector; +} \ No newline at end of file diff --git a/fdbclient/TransactionLineage.h b/fdbclient/TransactionLineage.h new file mode 100644 index 0000000000..b4518de231 --- /dev/null +++ b/fdbclient/TransactionLineage.h @@ -0,0 +1,128 @@ +/* + * TransactionLineage.h + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2021 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "fdbclient/ActorLineageProfiler.h" + +struct TransactionLineage : LineageProperties { + enum class Operation { + Unset, + GetValue, + GetKey, + GetKeyValues, + WatchValue, + GetConsistentReadVersion, + Commit, + GetKeyServersLocations + }; + static constexpr std::string_view name = "Transaction"sv; + uint64_t txID; + Operation operation = Operation::Unset; + + bool isSet(uint64_t TransactionLineage::*member) const { return this->*member > 0; } + bool isSet(Operation TransactionLineage::*member) const { return this->*member != Operation::Unset; } +}; + +struct TransactionLineageCollector : IALPCollector { + using Operation = TransactionLineage::Operation; + std::optional collect(ActorLineage* lineage) { + std::map res; + auto txID = lineage->get(&TransactionLineage::txID); + if (txID.has_value()) { + res["ID"sv] = txID.value(); + } + auto operation = lineage->get(&TransactionLineage::operation); + if (operation.has_value()) { + switch (operation.value()) { + case Operation::Unset: + res["operation"sv] = "Unset"sv; + break; + case Operation::GetValue: + res["operation"sv] = "GetValue"sv; + break; + case Operation::GetKey: + res["operation"sv] = "GetKey"sv; + break; + case Operation::GetKeyValues: + res["operation"sv] = "GetKeyValues"sv; + break; + case Operation::WatchValue: + res["operation"sv] = "WatchValue"sv; + break; + case Operation::GetConsistentReadVersion: + res["operation"sv] = "GetConsistentReadVersion"sv; + break; + case Operation::Commit: + res["operation"sv] = "Commit"sv; + break; + case Operation::GetKeyServersLocations: + res["operation"sv] = "GetKeyServersLocations"sv; + break; + } + } + if (res.empty()) { + return std::optional{}; + } else { + return res; + } + } +}; + +template +class ScopedLineage { + V before; + V T::*member; + bool valid = true; + +public: + ScopedLineage(V T::*member, V const& value) : member(member) { + auto val = currentLineage->modify(member); + before = val; + val = value; + } + 
~ScopedLineage() { + if (!valid) { + return; + } + currentLineage->modify(member) = before; + } + ScopedLineage(ScopedLineage&& o) : before(std::move(o.before)), member(o.member), valid(o.valid) { + o.release(); + } + ScopedLineage& operator=(ScopedLineage&& o) { + if (valid) { + currentLineage->modify(member) = before; + } + before = std::move(o.before); + member = o.member; + valid = o.valid; + o.release(); + return *this; + } + ScopedLineage(const ScopedLineage&) = delete; + ScopedLineage& operator=(const ScopedLineage&) = delete; + void release() { valid = false; } +}; + +template +ScopedLineage make_scoped_lineage(V T::*member, V const& value) { + return ScopedLineage(member, value); +} diff --git a/fdbserver/CommitProxyServer.actor.cpp b/fdbserver/CommitProxyServer.actor.cpp index 4ae833c050..428a384279 100644 --- a/fdbserver/CommitProxyServer.actor.cpp +++ b/fdbserver/CommitProxyServer.actor.cpp @@ -28,6 +28,7 @@ #include "fdbclient/CommitProxyInterface.h" #include "fdbclient/NativeAPI.actor.h" #include "fdbclient/SystemData.h" +#include "fdbclient/TransactionLineage.h" #include "fdbrpc/sim_validation.h" #include "fdbserver/ApplyMetadataMutation.h" #include "fdbserver/ConflictSet.h" @@ -1396,6 +1397,7 @@ ACTOR Future commitBatch(ProxyCommitData* self, // WARNING: this code is run at a high priority (until the first delay(0)), so it needs to do as little work as // possible state CommitBatch::CommitBatchContext context(self, trs, currentBatchMemBytesCount); + currentLineage->modify(&TransactionLineage::operation) = TransactionLineage::Operation::Commit; // Active load balancing runs at a very high priority (to obtain accurate estimate of memory used by commit batches) // so we need to downgrade here @@ -1432,6 +1434,8 @@ ACTOR Future commitBatch(ProxyCommitData* self, ACTOR static Future doKeyServerLocationRequest(GetKeyServerLocationsRequest req, ProxyCommitData* commitData) { // We can't respond to these requests until we have valid txnStateStore + 
currentLineage->modify(&TransactionLineage::operation) = TransactionLineage::Operation::GetKeyServersLocations; + currentLineage->modify(&TransactionLineage::txID) = req.spanContext.first(); wait(commitData->validState.getFuture()); wait(delay(0, TaskPriority::DefaultEndpoint)); diff --git a/fdbserver/GrvProxyServer.actor.cpp b/fdbserver/GrvProxyServer.actor.cpp index 8ab3719181..faad80d2d7 100644 --- a/fdbserver/GrvProxyServer.actor.cpp +++ b/fdbserver/GrvProxyServer.actor.cpp @@ -19,6 +19,7 @@ */ #include "fdbclient/Notified.h" +#include "fdbclient/TransactionLineage.h" #include "fdbserver/LogSystem.h" #include "fdbserver/LogSystemDiskQueueAdapter.h" #include "fdbclient/CommitProxyInterface.h" @@ -349,8 +350,11 @@ ACTOR Future queueGetReadVersionRequests(Reference> GrvProxyStats* stats, GrvTransactionRateInfo* batchRateInfo, TransactionTagMap* transactionTagCounter) { + currentLineage->modify(&TransactionLineage::operation) = TransactionLineage::Operation::GetConsistentReadVersion; loop choose { when(GetReadVersionRequest req = waitNext(readVersionRequests)) { + auto lineage = make_scoped_lineage(&TransactionLineage::txID, req.spanContext.first()); + // currentLineage->modify(&TransactionLineage::txID) = // WARNING: this code is run at a high priority, so it needs to do as little work as possible if (stats->txnRequestIn.getValue() - stats->txnRequestOut.getValue() > SERVER_KNOBS->START_TRANSACTION_MAX_QUEUE_SIZE) { @@ -637,6 +641,7 @@ ACTOR static Future transactionStarter(GrvProxyInterface proxy, state Span span; state int64_t midShardSize = SERVER_KNOBS->MIN_SHARD_BYTES; + currentLineage->modify(&TransactionLineage::operation) = TransactionLineage::Operation::GetConsistentReadVersion; addActor.send(monitorDDMetricsChanges(&midShardSize, db)); addActor.send(getRate(proxy.id(), diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp index 8c26f955bb..7538685acf 100644 --- a/fdbserver/storageserver.actor.cpp +++ 
b/fdbserver/storageserver.actor.cpp @@ -42,6 +42,7 @@ #include "fdbclient/Notified.h" #include "fdbclient/StatusClient.h" #include "fdbclient/SystemData.h" +#include "fdbclient/TransactionLineage.h" #include "fdbclient/VersionedMap.h" #include "fdbserver/FDBExecHelper.actor.h" #include "fdbserver/IKeyValueStore.h" @@ -521,7 +522,7 @@ public: // process of committing makeShardDurable) // == v -> k is readable (from storage+versionedData) @ [storageVersion,v], and not being updated // when version increases - // == latestVersion -> k is readable (from storage+versionedData) @ [storageVersion,version.get()], and thus + // == latestVersion -> k is readable (from stora ge+versionedData) @ [storageVersion,version.get()], and thus // stays available when version increases CoalescedKeyRangeMap newestAvailableVersion; @@ -874,7 +875,7 @@ public: } return fun(this, request); } -}; + }; const StringRef StorageServer::CurrentRunningFetchKeys::emptyString = LiteralStringRef(""); const KeyRangeRef StorageServer::CurrentRunningFetchKeys::emptyKeyRange = @@ -1106,6 +1107,7 @@ ACTOR Future getValueQ(StorageServer* data, GetValueRequest req) { state int64_t resultSize = 0; Span span("SS:getValue"_loc, { req.spanContext }); span.addTag("key"_sr, req.key); + currentLineage->modify(&TransactionLineage::txID) = req.spanContext.first(); try { ++data->counters.getValueQueries; @@ -1799,6 +1801,7 @@ ACTOR Future getKeyValuesQ(StorageServer* data, GetKeyValuesRequest req) { state Span span("SS:getKeyValues"_loc, { req.spanContext }); state int64_t resultSize = 0; + currentLineage->modify(&TransactionLineage::txID) = req.spanContext.first(); ++data->counters.getRangeQueries; ++data->counters.allQueries; @@ -1959,6 +1962,7 @@ ACTOR Future getKeyValuesQ(StorageServer* data, GetKeyValuesRequest req) ACTOR Future getKeyQ(StorageServer* data, GetKeyRequest req) { state Span span("SS:getKey"_loc, { req.spanContext }); state int64_t resultSize = 0; + 
currentLineage->modify(&TransactionLineage::txID) = req.spanContext.first(); ++data->counters.getKeyQueries; ++data->counters.allQueries; @@ -4324,6 +4328,7 @@ ACTOR Future checkBehind(StorageServer* self) { } ACTOR Future serveGetValueRequests(StorageServer* self, FutureStream getValue) { + currentLineage->modify(&TransactionLineage::operation) = TransactionLineage::Operation::GetValue; loop { GetValueRequest req = waitNext(getValue); // Warning: This code is executed at extremely high priority (TaskPriority::LoadBalancedEndpoint), so downgrade @@ -4341,6 +4346,7 @@ ACTOR Future serveGetValueRequests(StorageServer* self, FutureStream serveGetKeyValuesRequests(StorageServer* self, FutureStream getKeyValues) { + currentLineage->modify(&TransactionLineage::operation) = TransactionLineage::Operation::GetKeyValues; loop { GetKeyValuesRequest req = waitNext(getKeyValues); // Warning: This code is executed at extremely high priority (TaskPriority::LoadBalancedEndpoint), so downgrade @@ -4350,6 +4356,7 @@ ACTOR Future serveGetKeyValuesRequests(StorageServer* self, FutureStream serveGetKeyRequests(StorageServer* self, FutureStream getKey) { + currentLineage->modify(&TransactionLineage::operation) = TransactionLineage::Operation::GetKey; loop { GetKeyRequest req = waitNext(getKey); // Warning: This code is executed at extremely high priority (TaskPriority::LoadBalancedEndpoint), so downgrade @@ -4362,6 +4369,7 @@ ACTOR Future watchValueWaitForVersion(StorageServer* self, WatchValueRequest req, PromiseStream stream) { state Span span("SS:watchValueWaitForVersion"_loc, { req.spanContext }); + currentLineage->modify(&TransactionLineage::txID) = req.spanContext.first(); try { wait(success(waitForVersionNoTooOld(self, req.version))); stream.send(req); @@ -4375,9 +4383,11 @@ ACTOR Future watchValueWaitForVersion(StorageServer* self, ACTOR Future serveWatchValueRequestsImpl(StorageServer* self, FutureStream stream) { loop { + currentLineage->modify(&TransactionLineage::txID) = 0; 
state WatchValueRequest req = waitNext(stream); state Reference metadata = self->getWatchMetadata(req.key.contents()); state Span span("SS:serveWatchValueRequestsImpl"_loc, { req.spanContext }); + currentLineage->modify(&TransactionLineage::txID) = req.spanContext.first(); if (!metadata.isValid()) { // case 1: no watch set for the current key metadata = makeReference(req.key, req.value, req.version, req.tags, req.debugID); @@ -4451,6 +4461,7 @@ ACTOR Future serveWatchValueRequestsImpl(StorageServer* self, FutureStream ACTOR Future serveWatchValueRequests(StorageServer* self, FutureStream watchValue) { state PromiseStream stream; + currentLineage->modify(&TransactionLineage::operation) = TransactionLineage::Operation::WatchValue; self->actors.add(serveWatchValueRequestsImpl(self, stream.getFuture())); loop { diff --git a/fdbserver/worker.actor.cpp b/fdbserver/worker.actor.cpp index 4d05d3f5fe..2beccdf0ef 100644 --- a/fdbserver/worker.actor.cpp +++ b/fdbserver/worker.actor.cpp @@ -79,6 +79,10 @@ extern IKeyValueStore* keyValueStoreCompressTestData(IKeyValueStore* store); #define KV_STORE(filename, uid) keyValueStoreMemory(filename, uid) #endif +namespace { +RoleLineageCollector roleLineageCollector; +} + ACTOR Future> tryDBInfoBroadcast(RequestStream stream, UpdateServerDBInfoRequest req) { ErrorOr> rep = From 235717772281f3d545e571eb72624a9eb0a5320e Mon Sep 17 00:00:00 2001 From: Lukas Joswiak Date: Tue, 20 Apr 2021 15:05:51 -0700 Subject: [PATCH 082/180] Add bool support to global configuration --- fdbclient/ActorLineageProfiler.cpp | 4 ++++ fdbclient/GlobalConfig.actor.cpp | 4 +++- fdbclient/GlobalConfig.actor.h | 2 +- fdbclient/SpecialKeySpace.actor.cpp | 3 +++ fdbclient/Tuple.cpp | 29 +++++++++++++++++++++++++++++ fdbclient/Tuple.h | 4 +++- fdbserver/fdbserver.actor.cpp | 5 ++--- 7 files changed, 45 insertions(+), 6 deletions(-) diff --git a/fdbclient/ActorLineageProfiler.cpp b/fdbclient/ActorLineageProfiler.cpp index 82d04aa42c..733f581718 100644 --- 
a/fdbclient/ActorLineageProfiler.cpp +++ b/fdbclient/ActorLineageProfiler.cpp @@ -257,6 +257,10 @@ void ActorLineageProfilerT::setFrequency(unsigned frequency) { } else if (change) { cond.notify_all(); } + + if (frequency == 0) { + profilerThread.join(); + } } void ActorLineageProfilerT::profile() { diff --git a/fdbclient/GlobalConfig.actor.cpp b/fdbclient/GlobalConfig.actor.cpp index 95d7cfce13..8096688786 100644 --- a/fdbclient/GlobalConfig.actor.cpp +++ b/fdbclient/GlobalConfig.actor.cpp @@ -34,7 +34,7 @@ const KeyRef fdbClientInfoTxnSizeLimit = LiteralStringRef("config/fdb_client_inf const KeyRef transactionTagSampleRate = LiteralStringRef("config/transaction_tag_sample_rate"); const KeyRef transactionTagSampleCost = LiteralStringRef("config/transaction_tag_sample_cost"); -const KeyRef sampleFrequency = LiteralStringRef("visibility/sample_frequency"); +const KeyRef samplingFrequency = LiteralStringRef("visibility/sampling/frequency"); GlobalConfig::GlobalConfig() : lastUpdate(0) {} @@ -99,6 +99,8 @@ void GlobalConfig::insert(KeyRef key, ValueRef value) { any = StringRef(arena, t.getString(0).contents()); } else if (t.getType(0) == Tuple::ElementType::INT) { any = t.getInt(0); + } else if (t.getType(0) == Tuple::ElementType::BOOL) { + any = t.getBool(0); } else if (t.getType(0) == Tuple::ElementType::FLOAT) { any = t.getFloat(0); } else if (t.getType(0) == Tuple::ElementType::DOUBLE) { diff --git a/fdbclient/GlobalConfig.actor.h b/fdbclient/GlobalConfig.actor.h index bf7532a974..8835955400 100644 --- a/fdbclient/GlobalConfig.actor.h +++ b/fdbclient/GlobalConfig.actor.h @@ -49,7 +49,7 @@ extern const KeyRef fdbClientInfoTxnSizeLimit; extern const KeyRef transactionTagSampleRate; extern const KeyRef transactionTagSampleCost; -extern const KeyRef sampleFrequency; +extern const KeyRef samplingFrequency; // Structure used to hold the values stored by global configuration. 
The arena // is used as memory to store both the key and the value (the value is only diff --git a/fdbclient/SpecialKeySpace.actor.cpp b/fdbclient/SpecialKeySpace.actor.cpp index af1f106a66..603887fcf6 100644 --- a/fdbclient/SpecialKeySpace.actor.cpp +++ b/fdbclient/SpecialKeySpace.actor.cpp @@ -1397,6 +1397,9 @@ Future> GlobalConfigImpl::getRange(ReadYourWritesTran } else if (config->value.type() == typeid(int64_t)) { result.push_back_deep(result.arena(), KeyValueRef(prefixedKey, std::to_string(std::any_cast(config->value)))); + } else if (config->value.type() == typeid(bool)) { + result.push_back_deep(result.arena(), + KeyValueRef(prefixedKey, std::to_string(std::any_cast(config->value)))); } else if (config->value.type() == typeid(float)) { result.push_back_deep(result.arena(), KeyValueRef(prefixedKey, std::to_string(std::any_cast(config->value)))); diff --git a/fdbclient/Tuple.cpp b/fdbclient/Tuple.cpp index 367a7b80fb..ab1fcb0314 100644 --- a/fdbclient/Tuple.cpp +++ b/fdbclient/Tuple.cpp @@ -71,6 +71,8 @@ Tuple::Tuple(StringRef const& str, bool exclude_incomplete) { i += sizeof(float) + 1; } else if (data[i] == 0x21) { i += sizeof(double) + 1; + } else if (data[i] == 0x26 || data[i] == 0x27) { + i += 1; } else if (data[i] == '\x00') { i += 1; } else { @@ -144,6 +146,16 @@ Tuple& Tuple::append(int64_t value) { return *this; } +Tuple& Tuple::appendBool(bool value) { + offsets.push_back(data.size()); + if (value) { + data.push_back(data.arena(), 0x27); + } else { + data.push_back(data.arena(), 0x26); + } + return *this; +} + Tuple& Tuple::appendFloat(float value) { offsets.push_back(data.size()); float swap = bigEndianFloat(value); @@ -192,6 +204,8 @@ Tuple::ElementType Tuple::getType(size_t index) const { return ElementType::FLOAT; } else if (code == 0x21) { return ElementType::DOUBLE; + } else if (code == 0x26 || code == 0x27) { + return ElementType::BOOL; } else { throw invalid_tuple_data_type(); } @@ -287,6 +301,21 @@ int64_t Tuple::getInt(size_t index, bool 
allow_incomplete) const { } // TODO: Combine with bindings/flow/Tuple.*. This code is copied from there. +bool Tuple::getBool(size_t index) const { + if (index >= offsets.size()) { + throw invalid_tuple_index(); + } + ASSERT_LT(offsets[index], data.size()); + uint8_t code = data[offsets[index]]; + if (code == 0x26) { + return false; + } else if (code == 0x27) { + return true; + } else { + throw invalid_tuple_data_type(); + } +} + float Tuple::getFloat(size_t index) const { if (index >= offsets.size()) { throw invalid_tuple_index(); diff --git a/fdbclient/Tuple.h b/fdbclient/Tuple.h index 3dc597f262..62feba307b 100644 --- a/fdbclient/Tuple.h +++ b/fdbclient/Tuple.h @@ -40,6 +40,7 @@ struct Tuple { Tuple& append(int64_t); // There are some ambiguous append calls in fdbclient, so to make it easier // to add append for floats and doubles, name them differently for now. + Tuple& appendBool(bool); Tuple& appendFloat(float); Tuple& appendDouble(double); Tuple& appendNull(); @@ -51,7 +52,7 @@ struct Tuple { return append(t); } - enum ElementType { NULL_TYPE, INT, BYTES, UTF8, FLOAT, DOUBLE }; + enum ElementType { NULL_TYPE, INT, BYTES, UTF8, BOOL, FLOAT, DOUBLE }; // this is number of elements, not length of data size_t size() const { return offsets.size(); } @@ -59,6 +60,7 @@ struct Tuple { ElementType getType(size_t index) const; Standalone getString(size_t index) const; int64_t getInt(size_t index, bool allow_incomplete = false) const; + bool getBool(size_t index) const; float getFloat(size_t index) const; double getDouble(size_t index) const; diff --git a/fdbserver/fdbserver.actor.cpp b/fdbserver/fdbserver.actor.cpp index 59e2f494fc..ab31760f7f 100644 --- a/fdbserver/fdbserver.actor.cpp +++ b/fdbserver/fdbserver.actor.cpp @@ -464,14 +464,13 @@ ACTOR Future dumpDatabase(Database cx, std::string outputFilename, KeyRang ACTOR Future actorLineageProfiler() { wait(delay(1)); wait(GlobalConfig::globalConfig().onInitialized()); - // TODO: Add flag to enable/disable - state 
unsigned frequency = GlobalConfig::globalConfig().get(sampleFrequency, 0); + state unsigned frequency = GlobalConfig::globalConfig().get(samplingFrequency, 0); ActorLineageProfiler::instance().setFrequency(frequency); loop { wait(GlobalConfig::globalConfig().onChange()); - unsigned latestFrequency = GlobalConfig::globalConfig().get(sampleFrequency, 0); + unsigned latestFrequency = GlobalConfig::globalConfig().get(samplingFrequency, 0); if (latestFrequency != frequency) { frequency = latestFrequency; ActorLineageProfiler::instance().setFrequency(latestFrequency); From 115efaabc3b2d875a3ccabb8fd74c15fed55124c Mon Sep 17 00:00:00 2001 From: Lukas Joswiak Date: Tue, 20 Apr 2021 15:31:13 -0700 Subject: [PATCH 083/180] Move profiler start function --- ...ler.cpp => ActorLineageProfiler.actor.cpp} | 23 +++++++++++++++++- ...rofiler.h => ActorLineageProfiler.actor.h} | 13 +++++++++- fdbclient/CMakeLists.txt | 4 ++-- fdbserver/RoleLineage.actor.h | 2 +- fdbserver/fdbserver.actor.cpp | 24 ++----------------- 5 files changed, 39 insertions(+), 27 deletions(-) rename fdbclient/{ActorLineageProfiler.cpp => ActorLineageProfiler.actor.cpp} (89%) rename fdbclient/{ActorLineageProfiler.h => ActorLineageProfiler.actor.h} (90%) diff --git a/fdbclient/ActorLineageProfiler.cpp b/fdbclient/ActorLineageProfiler.actor.cpp similarity index 89% rename from fdbclient/ActorLineageProfiler.cpp rename to fdbclient/ActorLineageProfiler.actor.cpp index 733f581718..5c746ad9e2 100644 --- a/fdbclient/ActorLineageProfiler.cpp +++ b/fdbclient/ActorLineageProfiler.actor.cpp @@ -21,7 +21,8 @@ #include "flow/flow.h" #include "flow/singleton.h" #include "fdbrpc/IAsyncFile.h" -#include "fdbclient/ActorLineageProfiler.h" +#include "fdbclient/ActorLineageProfiler.actor.h" +#include "fdbclient/GlobalConfig.actor.h" #include #include #include @@ -279,3 +280,23 @@ void ActorLineageProfilerT::profile() { } } } + +// Handles running the sampling profiler, including responding to frequency +// changes and other 
updates the client wishes to make through global +// configuration. +ACTOR Future runSamplingProfiler() { + wait(delay(1)); // A bit of a hack to get around GlobalConfig not being setup yet + wait(GlobalConfig::globalConfig().onInitialized()); + state unsigned frequency = GlobalConfig::globalConfig().get(samplingFrequency, 0); + ActorLineageProfiler::instance().setFrequency(frequency); + + loop { + wait(GlobalConfig::globalConfig().onChange()); + + unsigned latestFrequency = GlobalConfig::globalConfig().get(samplingFrequency, 0); + if (latestFrequency != frequency) { + frequency = latestFrequency; + ActorLineageProfiler::instance().setFrequency(latestFrequency); + } + } +} diff --git a/fdbclient/ActorLineageProfiler.h b/fdbclient/ActorLineageProfiler.actor.h similarity index 90% rename from fdbclient/ActorLineageProfiler.h rename to fdbclient/ActorLineageProfiler.actor.h index 5dee2a4291..50d064b746 100644 --- a/fdbclient/ActorLineageProfiler.h +++ b/fdbclient/ActorLineageProfiler.actor.h @@ -19,6 +19,13 @@ */ #pragma once + +#if defined(NO_INTELLISENSE) && !defined(FLOW_ACTORLINEAGEPROFILER_ACTOR_G_H) +#define FLOW_ACTORLINEAGEPROFILER_ACTOR_G_H +#include "fdbclient/ActorLineageProfiler.actor.g.h" +#elif !defined(FLOW_ACTORLINEAGEPROFILER_ACTOR_H) +#define FLOW_ACTORLINEAGEPROFILER_ACTOR_H + #include "fdbclient/AnnotateActor.h" #include @@ -30,7 +37,9 @@ #include "flow/singleton.h" #include "flow/flow.h" -void runSamplingProfiler(); +#include "flow/actorcompiler.h" // This must be the last #include. 
+ +ACTOR Future runSamplingProfiler(); struct IALPCollectorBase { virtual std::optional collect(ActorLineage*) = 0; @@ -120,3 +129,5 @@ public: }; using ActorLineageProfiler = crossbow::singleton; + +#endif diff --git a/fdbclient/CMakeLists.txt b/fdbclient/CMakeLists.txt index ee87d08646..25825f3f23 100644 --- a/fdbclient/CMakeLists.txt +++ b/fdbclient/CMakeLists.txt @@ -1,6 +1,6 @@ set(FDBCLIENT_SRCS - ActorLineageProfiler.h - ActorLineageProfiler.cpp + ActorLineageProfiler.actor.h + ActorLineageProfiler.actor.cpp AnnotateActor.cpp AsyncFileS3BlobStore.actor.cpp AsyncFileS3BlobStore.actor.h diff --git a/fdbserver/RoleLineage.actor.h b/fdbserver/RoleLineage.actor.h index 5cbf65ed53..977adaa47b 100644 --- a/fdbserver/RoleLineage.actor.h +++ b/fdbserver/RoleLineage.actor.h @@ -28,7 +28,7 @@ #include "flow/singleton.h" #include "fdbrpc/Locality.h" -#include "fdbclient/ActorLineageProfiler.h" +#include "fdbclient/ActorLineageProfiler.actor.h" #include "fdbserver/WorkerInterface.actor.h" #include diff --git a/fdbserver/fdbserver.actor.cpp b/fdbserver/fdbserver.actor.cpp index ab31760f7f..53876fd6fd 100644 --- a/fdbserver/fdbserver.actor.cpp +++ b/fdbserver/fdbserver.actor.cpp @@ -35,7 +35,7 @@ #include #include -#include "fdbclient/ActorLineageProfiler.h" +#include "fdbclient/ActorLineageProfiler.actor.h" #include "fdbclient/GlobalConfig.actor.h" #include "fdbclient/NativeAPI.actor.h" #include "fdbclient/RestoreWorkerInterface.actor.h" @@ -458,26 +458,6 @@ ACTOR Future dumpDatabase(Database cx, std::string outputFilename, KeyRang } } -// Handles running the sampling profiler, including responding to frequency -// changes and other updates the client wishes to make through global -// configuration. 
-ACTOR Future actorLineageProfiler() { - wait(delay(1)); - wait(GlobalConfig::globalConfig().onInitialized()); - state unsigned frequency = GlobalConfig::globalConfig().get(samplingFrequency, 0); - ActorLineageProfiler::instance().setFrequency(frequency); - - loop { - wait(GlobalConfig::globalConfig().onChange()); - - unsigned latestFrequency = GlobalConfig::globalConfig().get(samplingFrequency, 0); - if (latestFrequency != frequency) { - frequency = latestFrequency; - ActorLineageProfiler::instance().setFrequency(latestFrequency); - } - } -} - void memoryTest(); void skipListTest(); @@ -2009,7 +1989,7 @@ int main(int argc, char* argv[]) { opts.whitelistBinPaths)); actors.push_back(histogramReport()); // actors.push_back( recurring( []{}, .001 ) ); // for ASIO latency measurement - actors.push_back(actorLineageProfiler()); + actors.push_back(runSamplingProfiler()); f = stopAfter(waitForAll(actors)); g_network->run(); From 9e89159efb7a994d2d880ddd474bc9834cbd6a2e Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Tue, 20 Apr 2021 16:21:01 -0700 Subject: [PATCH 084/180] Don't use DLDatabase objects before they are ready (applicable for API versions < 610). Fix reference counting of DLDatabase objects to avoid leaking the underlying database handle. Update release notes to note that clients older than 6.2 still create extra connections. 
--- .../release-notes/release-notes-630.rst | 2 +- .../release-notes/release-notes-700.rst | 2 +- fdbclient/MultiVersionTransaction.actor.cpp | 94 ++++++++++++------- fdbclient/MultiVersionTransaction.h | 4 + 4 files changed, 66 insertions(+), 36 deletions(-) diff --git a/documentation/sphinx/source/release-notes/release-notes-630.rst b/documentation/sphinx/source/release-notes/release-notes-630.rst index cd8c5e4150..f4b5c8aacb 100644 --- a/documentation/sphinx/source/release-notes/release-notes-630.rst +++ b/documentation/sphinx/source/release-notes/release-notes-630.rst @@ -4,7 +4,7 @@ Release Notes 6.3.13 ====== -* The multi-version client now requires at most two connections to the cluster, regardless of how many external clients are configured. `(PR #4667) `_ +* The multi-version client now requires at most two client connections with version 6.2 or larger, regardless of how many external clients are configured. Clients older than 6.2 will continue to create an additional connection each. `(PR #4667) `_ 6.3.12 ====== diff --git a/documentation/sphinx/source/release-notes/release-notes-700.rst b/documentation/sphinx/source/release-notes/release-notes-700.rst index 5f3d3a4669..84e8f0680a 100644 --- a/documentation/sphinx/source/release-notes/release-notes-700.rst +++ b/documentation/sphinx/source/release-notes/release-notes-700.rst @@ -16,7 +16,7 @@ Performance ----------- * Increased performance of dr_agent when copying the mutation log. The ``COPY_LOG_BLOCK_SIZE``, ``COPY_LOG_BLOCKS_PER_TASK``, ``COPY_LOG_PREFETCH_BLOCKS``, ``COPY_LOG_READ_AHEAD_BYTES`` and ``COPY_LOG_TASK_DURATION_NANOS`` knobs can be set. `(PR #3436) `_ -* Reduced the number of connections required by the multi-version client when loading external clients. When connection to 7.0 clusters, only one connection will be used. With older clusters, at most two connections will be used. 
`(PR #4667) `_ +* Reduced the number of connections required by the multi-version client when loading external clients. When connecting to 7.0 clusters, only one connection with version 6.2 or larger will be used. With older clusters, at most two connections with version 6.2 or larger will be used. Clients older than version 6.2 will continue to create an additional connection each. `(PR #4667) `_ Reliability ----------- diff --git a/fdbclient/MultiVersionTransaction.actor.cpp b/fdbclient/MultiVersionTransaction.actor.cpp index b39fde5cfd..555765c26c 100644 --- a/fdbclient/MultiVersionTransaction.actor.cpp +++ b/fdbclient/MultiVersionTransaction.actor.cpp @@ -289,12 +289,15 @@ void DLTransaction::reset() { // DLDatabase DLDatabase::DLDatabase(Reference api, ThreadFuture dbFuture) : api(api), db(nullptr) { + addref(); ready = mapThreadFuture(dbFuture, [this](ErrorOr db) { if (db.isError()) { + delref(); return ErrorOr(db.getError()); } this->db = db.get(); + delref(); return ErrorOr(Void()); }); } @@ -1013,12 +1016,56 @@ ThreadFuture MultiVersionDatabase::DatabaseState::monitorProtocolVersion() }); } +// Replaces the active database connection with a new one. Must be called from the main thread. +void MultiVersionDatabase::DatabaseState::updateDatabase(Reference newDb, Reference client) { + if (newDb) { + optionLock.enter(); + for (auto option : options) { + try { + // In practice, this will set a deferred error instead of throwing. If that happens, the database + // will be unusable (attempts to use it will throw errors). 
+ newDb->setOption(option.first, option.second.castTo()); + } catch (Error& e) { + optionLock.leave(); + + // If we can't set all of the options on a cluster, we abandon the client + TraceEvent(SevError, "ClusterVersionChangeOptionError") + .error(e) + .detail("Option", option.first) + .detail("OptionValue", option.second) + .detail("LibPath", client->libPath); + client->failed = true; + MultiVersionApi::api->updateSupportedVersions(); + newDb = Reference(); + break; + } + } + + db = newDb; + + optionLock.leave(); + + if (dbProtocolVersion.get().hasStableInterfaces() && db) { + versionMonitorDb = db; + } else { + versionMonitorDb = MultiVersionApi::api->getLocalClient()->api->createDatabase(clusterFilePath.c_str()); + } + } else { + db = Reference(); + versionMonitorDb = MultiVersionApi::api->getLocalClient()->api->createDatabase(clusterFilePath.c_str()); + } + + dbVar->set(db); + protocolVersionMonitor = monitorProtocolVersion(); +} + // Called when a change to the protocol version of the cluster has been detected. Must be called from the main // thread. void MultiVersionDatabase::DatabaseState::protocolVersionChanged(ProtocolVersion protocolVersion) { if (dbProtocolVersion.present() && protocolVersion.normalizedVersion() == dbProtocolVersion.get().normalizedVersion()) { dbProtocolVersion = protocolVersion; + protocolVersionMonitor = monitorProtocolVersion(); } else { TraceEvent("ProtocolVersionChanged") .detail("NewProtocolVersion", protocolVersion) @@ -1036,46 +1083,25 @@ void MultiVersionDatabase::DatabaseState::protocolVersionChanged(ProtocolVersion Reference newDb = client->api->createDatabase(clusterFilePath.c_str()); - optionLock.enter(); - for (auto option : options) { - try { - // In practice, this will set a deferred error instead of throwing. If that happens, the database - // will be unusable (attempts to use it will throw errors). 
- newDb->setOption(option.first, option.second.castTo()); - } catch (Error& e) { - optionLock.leave(); + if (client->external && !MultiVersionApi::apiVersionAtLeast(610)) { + dbReady = mapThreadFuture( + newDb.castTo()->onReady(), [this, newDb, client](ErrorOr ready) { + if (!ready.isError()) { + onMainThreadVoid([this, newDb, client]() { updateDatabase(newDb, client); }, nullptr); + } else { + updateDatabase(Reference(), client); + } - // If we can't set all of the options on a cluster, we abandon the client - TraceEvent(SevError, "ClusterVersionChangeOptionError") - .error(e) - .detail("Option", option.first) - .detail("OptionValue", option.second) - .detail("LibPath", client->libPath); - client->failed = true; - MultiVersionApi::api->updateSupportedVersions(); - newDb = Reference(); - break; - } - } - - db = newDb; - - optionLock.leave(); - - if (dbProtocolVersion.get().hasStableInterfaces() && db) { - versionMonitorDb = db; + dbReady = ThreadFuture(); + return ready; + }); } else { - versionMonitorDb = MultiVersionApi::api->getLocalClient()->api->createDatabase(clusterFilePath.c_str()); + updateDatabase(newDb, client); } } else { - db = Reference(); - versionMonitorDb = MultiVersionApi::api->getLocalClient()->api->createDatabase(clusterFilePath.c_str()); + updateDatabase(Reference(), Reference()); } - - dbVar->set(db); } - - protocolVersionMonitor = monitorProtocolVersion(); } std::atomic_flag MultiVersionDatabase::externalClientsInitialized = ATOMIC_FLAG_INIT; diff --git a/fdbclient/MultiVersionTransaction.h b/fdbclient/MultiVersionTransaction.h index c8aaeb840e..4e0e91a969 100644 --- a/fdbclient/MultiVersionTransaction.h +++ b/fdbclient/MultiVersionTransaction.h @@ -467,6 +467,9 @@ public: struct DatabaseState : ThreadSafeReferenceCounted { DatabaseState(std::string clusterFilePath, Reference versionMonitorDb); + // Replaces the active database connection with a new one. Must be called from the main thread. 
+ void updateDatabase(Reference newDb, Reference client); + // Called when a change to the protocol version of the cluster has been detected. Must be called from the main // thread. void protocolVersionChanged(ProtocolVersion protocolVersion); @@ -490,6 +493,7 @@ public: bool cancelled; + ThreadFuture dbReady; ThreadFuture protocolVersionMonitor; Optional dbProtocolVersion; std::map> clients; From 15336ca274261bdfc27143c5143d02fa90ee0472 Mon Sep 17 00:00:00 2001 From: Lukas Joswiak Date: Tue, 20 Apr 2021 17:51:38 -0700 Subject: [PATCH 085/180] Add callback for specific global configuration key changes --- ...ler.actor.cpp => ActorLineageProfiler.cpp} | 29 +++++++------------ ...rofiler.actor.h => ActorLineageProfiler.h} | 12 +------- fdbclient/CMakeLists.txt | 4 +-- fdbclient/GlobalConfig.actor.cpp | 17 +++++++++-- fdbclient/GlobalConfig.actor.h | 9 ++++++ fdbclient/NativeAPI.actor.cpp | 2 ++ fdbserver/RoleLineage.actor.h | 2 +- fdbserver/fdbserver.actor.cpp | 3 +- fdbserver/worker.actor.cpp | 3 ++ 9 files changed, 44 insertions(+), 37 deletions(-) rename fdbclient/{ActorLineageProfiler.actor.cpp => ActorLineageProfiler.cpp} (90%) rename fdbclient/{ActorLineageProfiler.actor.h => ActorLineageProfiler.h} (90%) diff --git a/fdbclient/ActorLineageProfiler.actor.cpp b/fdbclient/ActorLineageProfiler.cpp similarity index 90% rename from fdbclient/ActorLineageProfiler.actor.cpp rename to fdbclient/ActorLineageProfiler.cpp index 5c746ad9e2..c317a88f37 100644 --- a/fdbclient/ActorLineageProfiler.actor.cpp +++ b/fdbclient/ActorLineageProfiler.cpp @@ -21,7 +21,7 @@ #include "flow/flow.h" #include "flow/singleton.h" #include "fdbrpc/IAsyncFile.h" -#include "fdbclient/ActorLineageProfiler.actor.h" +#include "fdbclient/ActorLineageProfiler.h" #include "fdbclient/GlobalConfig.actor.h" #include #include @@ -259,7 +259,7 @@ void ActorLineageProfilerT::setFrequency(unsigned frequency) { cond.notify_all(); } - if (frequency == 0) { + if (frequency == 0 && profilerThread.joinable()) { 
profilerThread.join(); } } @@ -281,22 +281,13 @@ void ActorLineageProfilerT::profile() { } } -// Handles running the sampling profiler, including responding to frequency -// changes and other updates the client wishes to make through global -// configuration. -ACTOR Future runSamplingProfiler() { - wait(delay(1)); // A bit of a hack to get around GlobalConfig not being setup yet - wait(GlobalConfig::globalConfig().onInitialized()); - state unsigned frequency = GlobalConfig::globalConfig().get(samplingFrequency, 0); - ActorLineageProfiler::instance().setFrequency(frequency); - - loop { - wait(GlobalConfig::globalConfig().onChange()); - - unsigned latestFrequency = GlobalConfig::globalConfig().get(samplingFrequency, 0); - if (latestFrequency != frequency) { - frequency = latestFrequency; - ActorLineageProfiler::instance().setFrequency(latestFrequency); - } +// Callback used to update the sampling profilers run frequency whenever the +// frequency changes. +void samplingProfilerUpdateFrequency(std::optional freq) { + double frequency = 0; + if (freq.has_value()) { + frequency = std::any_cast(freq.value()); } + TraceEvent(SevInfo, "SamplingProfilerUpdateFrequency").detail("Frequency", frequency); + ActorLineageProfiler::instance().setFrequency(frequency); } diff --git a/fdbclient/ActorLineageProfiler.actor.h b/fdbclient/ActorLineageProfiler.h similarity index 90% rename from fdbclient/ActorLineageProfiler.actor.h rename to fdbclient/ActorLineageProfiler.h index 50d064b746..b73e7d04eb 100644 --- a/fdbclient/ActorLineageProfiler.actor.h +++ b/fdbclient/ActorLineageProfiler.h @@ -20,12 +20,6 @@ #pragma once -#if defined(NO_INTELLISENSE) && !defined(FLOW_ACTORLINEAGEPROFILER_ACTOR_G_H) -#define FLOW_ACTORLINEAGEPROFILER_ACTOR_G_H -#include "fdbclient/ActorLineageProfiler.actor.g.h" -#elif !defined(FLOW_ACTORLINEAGEPROFILER_ACTOR_H) -#define FLOW_ACTORLINEAGEPROFILER_ACTOR_H - #include "fdbclient/AnnotateActor.h" #include @@ -37,9 +31,7 @@ #include "flow/singleton.h" 
#include "flow/flow.h" -#include "flow/actorcompiler.h" // This must be the last #include. - -ACTOR Future runSamplingProfiler(); +void samplingProfilerUpdateFrequency(std::optional freq); struct IALPCollectorBase { virtual std::optional collect(ActorLineage*) = 0; @@ -129,5 +121,3 @@ public: }; using ActorLineageProfiler = crossbow::singleton; - -#endif diff --git a/fdbclient/CMakeLists.txt b/fdbclient/CMakeLists.txt index 25825f3f23..ee87d08646 100644 --- a/fdbclient/CMakeLists.txt +++ b/fdbclient/CMakeLists.txt @@ -1,6 +1,6 @@ set(FDBCLIENT_SRCS - ActorLineageProfiler.actor.h - ActorLineageProfiler.actor.cpp + ActorLineageProfiler.h + ActorLineageProfiler.cpp AnnotateActor.cpp AsyncFileS3BlobStore.actor.cpp AsyncFileS3BlobStore.actor.h diff --git a/fdbclient/GlobalConfig.actor.cpp b/fdbclient/GlobalConfig.actor.cpp index 8096688786..79bbbb2202 100644 --- a/fdbclient/GlobalConfig.actor.cpp +++ b/fdbclient/GlobalConfig.actor.cpp @@ -87,6 +87,10 @@ Future GlobalConfig::onChange() { return configChanged.onTrigger(); } +void GlobalConfig::trigger(KeyRef key, std::function)> fn) { + callbacks.emplace(key, std::move(fn)); +} + void GlobalConfig::insert(KeyRef key, ValueRef value) { data.erase(key); @@ -109,19 +113,26 @@ void GlobalConfig::insert(KeyRef key, ValueRef value) { ASSERT(false); } data[stableKey] = makeReference(std::move(arena), std::move(any)); + + if (callbacks.find(stableKey) != callbacks.end()) { + callbacks[stableKey](data[stableKey]->value); + } } catch (Error& e) { TraceEvent("GlobalConfigTupleParseError").detail("What", e.what()); } } void GlobalConfig::erase(KeyRef key) { - data.erase(key); + erase(KeyRangeRef(key, keyAfter(key))); } void GlobalConfig::erase(KeyRangeRef range) { auto it = data.begin(); while (it != data.end()) { if (range.contains(it->first)) { + if (callbacks.find(it->first) != callbacks.end()) { + callbacks[it->first](std::nullopt); + } it = data.erase(it); } else { ++it; @@ -175,7 +186,9 @@ ACTOR Future 
GlobalConfig::migrate(GlobalConfig* self) { // Updates local copy of global configuration by reading the entire key-range // from storage. ACTOR Future GlobalConfig::refresh(GlobalConfig* self) { - self->data.clear(); + for (const auto& [key, _] : self->data) { + self->erase(key); + } Transaction tr(self->cx); Standalone result = wait(tr.getRange(globalConfigDataKeys, CLIENT_KNOBS->TOO_MANY)); diff --git a/fdbclient/GlobalConfig.actor.h b/fdbclient/GlobalConfig.actor.h index 8835955400..de98c442e1 100644 --- a/fdbclient/GlobalConfig.actor.h +++ b/fdbclient/GlobalConfig.actor.h @@ -27,7 +27,9 @@ #define FDBCLIENT_GLOBALCONFIG_ACTOR_H #include +#include #include +#include #include #include @@ -128,6 +130,12 @@ public: // configuration changes. Future onChange(); + // Calls \ref fn when the value associated with \ref key is changed. \ref + // fn is passed the updated value for the key, or an empty optional if the + // key has been cleared. If the value is an allocated object, its memory + // remains in the control of the global configuration. 
+ void trigger(KeyRef key, std::function)> fn); + private: GlobalConfig(); @@ -156,6 +164,7 @@ private: AsyncTrigger configChanged; std::unordered_map> data; Version lastUpdate; + std::unordered_map)>> callbacks; }; #endif diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 1857cea0c7..cd7638221b 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -32,6 +32,7 @@ #include "fdbrpc/FailureMonitor.h" #include "fdbrpc/MultiInterface.h" +#include "fdbclient/ActorLineageProfiler.h" #include "fdbclient/AnnotateActor.h" #include "fdbclient/Atomic.h" #include "fdbclient/ClusterInterface.h" @@ -960,6 +961,7 @@ DatabaseContext::DatabaseContext(Reference diff --git a/fdbserver/fdbserver.actor.cpp b/fdbserver/fdbserver.actor.cpp index 53876fd6fd..1d66b163d4 100644 --- a/fdbserver/fdbserver.actor.cpp +++ b/fdbserver/fdbserver.actor.cpp @@ -35,7 +35,7 @@ #include #include -#include "fdbclient/ActorLineageProfiler.actor.h" +#include "fdbclient/ActorLineageProfiler.h" #include "fdbclient/GlobalConfig.actor.h" #include "fdbclient/NativeAPI.actor.h" #include "fdbclient/RestoreWorkerInterface.actor.h" @@ -1989,7 +1989,6 @@ int main(int argc, char* argv[]) { opts.whitelistBinPaths)); actors.push_back(histogramReport()); // actors.push_back( recurring( []{}, .001 ) ); // for ASIO latency measurement - actors.push_back(runSamplingProfiler()); f = stopAfter(waitForAll(actors)); g_network->run(); diff --git a/fdbserver/worker.actor.cpp b/fdbserver/worker.actor.cpp index 4d05d3f5fe..fea422dcd8 100644 --- a/fdbserver/worker.actor.cpp +++ b/fdbserver/worker.actor.cpp @@ -22,6 +22,7 @@ #include #include "fdbrpc/Locality.h" +#include "fdbclient/GlobalConfig.actor.h" #include "fdbclient/ProcessInterface.h" #include "fdbclient/StorageServerInterface.h" #include "fdbserver/Knobs.h" @@ -1038,6 +1039,8 @@ ACTOR Future workerServer(Reference connFile, metricsLogger = runMetrics(openDBOnServer(dbInfo, TaskPriority::DefaultEndpoint, true, 
lockAware), KeyRef(metricsPrefix)); } + + GlobalConfig::globalConfig().trigger(samplingFrequency, samplingProfilerUpdateFrequency); } errorForwarders.add(resetAfter(degraded, From 8b280f5be637a465e57c1821f3fa41d07619da6e Mon Sep 17 00:00:00 2001 From: Lukas Joswiak Date: Tue, 20 Apr 2021 17:55:27 -0700 Subject: [PATCH 086/180] Remove old includes --- fdbclient/ActorLineageProfiler.cpp | 1 - fdbserver/fdbserver.actor.cpp | 2 -- 2 files changed, 3 deletions(-) diff --git a/fdbclient/ActorLineageProfiler.cpp b/fdbclient/ActorLineageProfiler.cpp index c317a88f37..42ac76da90 100644 --- a/fdbclient/ActorLineageProfiler.cpp +++ b/fdbclient/ActorLineageProfiler.cpp @@ -22,7 +22,6 @@ #include "flow/singleton.h" #include "fdbrpc/IAsyncFile.h" #include "fdbclient/ActorLineageProfiler.h" -#include "fdbclient/GlobalConfig.actor.h" #include #include #include diff --git a/fdbserver/fdbserver.actor.cpp b/fdbserver/fdbserver.actor.cpp index 1d66b163d4..136cd90c3d 100644 --- a/fdbserver/fdbserver.actor.cpp +++ b/fdbserver/fdbserver.actor.cpp @@ -35,8 +35,6 @@ #include #include -#include "fdbclient/ActorLineageProfiler.h" -#include "fdbclient/GlobalConfig.actor.h" #include "fdbclient/NativeAPI.actor.h" #include "fdbclient/RestoreWorkerInterface.actor.h" #include "fdbclient/SystemData.h" From 36b1ab7ba5fabaf0214785fff286dabcbaaced1f Mon Sep 17 00:00:00 2001 From: Lukas Joswiak Date: Tue, 20 Apr 2021 22:05:16 -0700 Subject: [PATCH 087/180] Detach profiler thread instead of joining it --- fdbclient/ActorLineageProfiler.cpp | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/fdbclient/ActorLineageProfiler.cpp b/fdbclient/ActorLineageProfiler.cpp index 42ac76da90..fe335d90d5 100644 --- a/fdbclient/ActorLineageProfiler.cpp +++ b/fdbclient/ActorLineageProfiler.cpp @@ -250,23 +250,30 @@ void ActorLineageProfilerT::stop() { } void ActorLineageProfilerT::setFrequency(unsigned frequency) { + unsigned oldFrequency = this->frequency; bool change = 
this->frequency != frequency; this->frequency = frequency; - if (frequency != 0 && !profilerThread.joinable()) { - profilerThread = std::thread(std::bind(&ActorLineageProfilerT::profile, this)); - } else if (change) { - cond.notify_all(); - } - if (frequency == 0 && profilerThread.joinable()) { - profilerThread.join(); + if (change) { + // Profiler thread will automatically switch to new frequency after + // being triggered by the the condition variable. Only need to start a + // new profiler thread if the old one has been stopped due to the + // profiler thread returning (frequency set to 0). + if (oldFrequency == 0 && frequency != 0) { + std::thread(&ActorLineageProfilerT::profile, this).detach(); + } + cond.notify_all(); } } void ActorLineageProfilerT::profile() { + static std::atomic_int profileThreadCount = 0; + ASSERT(++profileThreadCount == 1); + for (;;) { collection->refresh(); if (frequency == 0) { + profileThreadCount--; return; } { @@ -275,6 +282,7 @@ void ActorLineageProfilerT::profile() { // cond.wait_until(lock, lastSample + std::chrono::milliseconds) } if (frequency == 0) { + profileThreadCount--; return; } } From 28f8a2716e03384e113255d33c69fe4a75fc79c0 Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Wed, 21 Apr 2021 11:54:05 -0700 Subject: [PATCH 088/180] For old incompatible connections, set the correct protocol version on the version async var --- fdbrpc/FlowTransport.actor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbrpc/FlowTransport.actor.cpp b/fdbrpc/FlowTransport.actor.cpp index b7221c8876..47bf03c7e8 100644 --- a/fdbrpc/FlowTransport.actor.cpp +++ b/fdbrpc/FlowTransport.actor.cpp @@ -1217,7 +1217,7 @@ ACTOR static Future connectionReader(TransportData* transport, if (!protocolVersion.hasMultiVersionClient()) { // Older versions expected us to hang up. It may work even if we don't hang up here, but // it's safer to keep the old behavior. 
- peer->protocolVersion->set(peerProtocolVersion); + peer->protocolVersion->set(protocolVersion); throw incompatible_protocol_version(); } } else { From f485d7fa5ea0bc6090acb2a4caba97d6aeb3b00b Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Wed, 21 Apr 2021 12:25:03 -0700 Subject: [PATCH 089/180] Fix comment typo --- fdbclient/MultiVersionTransaction.actor.cpp | 2 +- fdbclient/MultiVersionTransaction.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fdbclient/MultiVersionTransaction.actor.cpp b/fdbclient/MultiVersionTransaction.actor.cpp index 555765c26c..0168dea969 100644 --- a/fdbclient/MultiVersionTransaction.actor.cpp +++ b/fdbclient/MultiVersionTransaction.actor.cpp @@ -1569,7 +1569,7 @@ void MultiVersionApi::addNetworkThreadCompletionHook(void (*hook)(void*), void* } } -// Creates an IDatabase object that represents a connections to the cluster +// Creates an IDatabase object that represents a connection to the cluster Reference MultiVersionApi::createDatabase(const char* clusterFilePath) { lock.enter(); if (!networkSetup) { diff --git a/fdbclient/MultiVersionTransaction.h b/fdbclient/MultiVersionTransaction.h index 4e0e91a969..4bad3c7ca9 100644 --- a/fdbclient/MultiVersionTransaction.h +++ b/fdbclient/MultiVersionTransaction.h @@ -526,7 +526,7 @@ public: void stopNetwork() override; void addNetworkThreadCompletionHook(void (*hook)(void*), void* hookParameter) override; - // Creates an IDatabase object that represents a connections to the cluster + // Creates an IDatabase object that represents a connection to the cluster Reference createDatabase(const char* clusterFilePath) override; static MultiVersionApi* api; From 80e15e87685fd462d8051a1728f8680bc08f69de Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Wed, 21 Apr 2021 14:56:02 -0600 Subject: [PATCH 090/180] started implementation --- .stignore | 2 + fdbclient/ActorLineageProfiler.cpp | 3 ++ fdbclient/ActorLineageProfiler.h | 50 +++++++++++++++++++++- fdbclient/CMakeLists.txt | 
1 + fdbclient/FluentDSampleIngestor.cpp | 65 +++++++++++++++++++++++++++++ okteto.yml | 12 ++++++ 6 files changed, 131 insertions(+), 2 deletions(-) create mode 100644 .stignore create mode 100644 fdbclient/FluentDSampleIngestor.cpp create mode 100644 okteto.yml diff --git a/.stignore b/.stignore new file mode 100644 index 0000000000..7500a08f9f --- /dev/null +++ b/.stignore @@ -0,0 +1,2 @@ +.git +.clangd diff --git a/fdbclient/ActorLineageProfiler.cpp b/fdbclient/ActorLineageProfiler.cpp index 82d04aa42c..f2e65e47fb 100644 --- a/fdbclient/ActorLineageProfiler.cpp +++ b/fdbclient/ActorLineageProfiler.cpp @@ -213,6 +213,7 @@ void SampleCollection_t::refresh() { oldest = data.front()->time; } } + config->ingest(sample); } std::vector> SampleCollection_t::get(double from /*= 0.0*/, @@ -275,3 +276,5 @@ void ActorLineageProfilerT::profile() { } } } + +SampleIngestor::~SampleIngestor() {} diff --git a/fdbclient/ActorLineageProfiler.h b/fdbclient/ActorLineageProfiler.h index 3f11840714..4d32760e32 100644 --- a/fdbclient/ActorLineageProfiler.h +++ b/fdbclient/ActorLineageProfiler.h @@ -30,8 +30,6 @@ #include "flow/singleton.h" #include "flow/flow.h" -void runSamplingProfiler(); - struct IALPCollectorBase { virtual std::optional collect(ActorLineage*) = 0; virtual const std::string_view& name() = 0; @@ -50,6 +48,53 @@ struct Sample : std::enable_shared_from_this { ~Sample() { ::free(data); } }; +class SampleIngestor : std::enable_shared_from_this { +public: + virtual ~SampleIngestor(); + virtual void ingest(std::shared_ptr const& sample) = 0; +}; + +class NoneIngestor : public SampleIngestor { +public: + void ingest(std::shared_ptr const& sample) override {} +}; + +// The FluentD ingestor uses the pimp idiom. 
This is to make compilation less heavy weight as this implementation has +// dependencies to boost::asio +struct FluentDIngestorImpl; + +class FluentDIngestor : public SampleIngestor { +public: // Public Types + enum class Protocol { TCP, UDP }; + +private: // members + FluentDIngestorImpl* impl; + +public: // interface + void ingest(std::shared_ptr const& sample) override; + FluentDIngestor(Protocol protocol, NetworkAddress& endpoint); + ~FluentDIngestor(); +}; + +class ProfilerConfigT { +private: // private types + using Lock = std::unique_lock; + friend class crossbow::create_static; + +private: // members + std::shared_ptr ingestor = std::make_shared(); + +private: // construction + ProfilerConfigT() {} + ProfilerConfigT(ProfilerConfigT const&) = delete; + ProfilerConfigT& operator=(ProfilerConfigT const&) = delete; + +public: + void setBackend(std::shared_ptr ingestor) { this->ingestor = ingestor; } +}; + +using ProfilerConfig = crossbow::singleton; + class SampleCollectorT { public: // Types friend struct crossbow::create_static; @@ -78,6 +123,7 @@ class SampleCollection_t { mutable std::mutex mutex; std::atomic windowSize = 0.0; std::deque> data; + ProfilerConfig config; public: /** diff --git a/fdbclient/CMakeLists.txt b/fdbclient/CMakeLists.txt index ee87d08646..e9d3d3716b 100644 --- a/fdbclient/CMakeLists.txt +++ b/fdbclient/CMakeLists.txt @@ -30,6 +30,7 @@ set(FDBCLIENT_SRCS EventTypes.actor.h FDBOptions.h FDBTypes.h + FluentDSampleIngestor.cpp FileBackupAgent.actor.cpp GlobalConfig.h GlobalConfig.actor.h diff --git a/fdbclient/FluentDSampleIngestor.cpp b/fdbclient/FluentDSampleIngestor.cpp new file mode 100644 index 0000000000..0a81ba0613 --- /dev/null +++ b/fdbclient/FluentDSampleIngestor.cpp @@ -0,0 +1,65 @@ +/* + * FluentDSampleIngestor.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2021 Apple Inc. 
and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "fdbclient/ActorLineageProfiler.h" +#include + +namespace { +struct FluentDSocket { + virtual ~FluentDSocket() {} + virtual void connect(NetworkAddress& endpoint) = 0; + // virtual void send() = 0; +}; + +struct TCPFluentDSocket : FluentDSocket { + boost::asio::io_context& io_context; + boost::asio::ip::tcp::socket socket; + TCPFluentDSocket(boost::asio::io_context& context) : io_context(context), socket(context) {} + void connect(NetworkAddress& endpoint) override { boost::asio::ip::tcp::resolver resolver(io_context); } +}; + +struct UDPFluentDSocket : FluentDSocket { + boost::asio::io_context& io_context; + boost::asio::ip::tcp::socket socket; + UDPFluentDSocket(boost::asio::io_context& context) : io_context(context), socket(context) {} + void connect(NetworkAddress& endpoint) override {} +}; +} // namespace + +struct FluentDIngestorImpl { + using Protocol = FluentDIngestor::Protocol; + boost::asio::io_context io_context; + std::unique_ptr socket; + FluentDIngestorImpl(Protocol protocol, NetworkAddress& endpoint) { + switch (protocol) { + case Protocol::TCP: + socket.reset(new TCPFluentDSocket(io_context)); + break; + case Protocol::UDP: + socket.reset(new UDPFluentDSocket(io_context)); + break; + } + socket->connect(endpoint); + } +}; + +FluentDIngestor::~FluentDIngestor() {} + +FluentDIngestor::FluentDIngestor(Protocol protocol, NetworkAddress& 
endpoint) {} \ No newline at end of file diff --git a/okteto.yml b/okteto.yml new file mode 100644 index 0000000000..efa744a7d8 --- /dev/null +++ b/okteto.yml @@ -0,0 +1,12 @@ +name: foundationdb +autocreate: true +image: foundationdb/devel:centos7-latest +command: bash +volumes: +- /root/.m2 +- /root/build +sync: +- .:/usr/src/fdb +forward: +- 5005:5005 +- 8080:8080 From 99c1edf87eb57816b1e7b75f1a57cae89a731456 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Thu, 22 Apr 2021 17:48:09 -0600 Subject: [PATCH 091/180] Implemented fluentd functionality --- fdbclient/ActorLineageProfiler.cpp | 84 ++++++++-------- fdbclient/ActorLineageProfiler.h | 19 ++-- fdbclient/FluentDSampleIngestor.cpp | 143 ++++++++++++++++++++++++---- 3 files changed, 181 insertions(+), 65 deletions(-) diff --git a/fdbclient/ActorLineageProfiler.cpp b/fdbclient/ActorLineageProfiler.cpp index 5bcfaacbb5..a62d0ae890 100644 --- a/fdbclient/ActorLineageProfiler.cpp +++ b/fdbclient/ActorLineageProfiler.cpp @@ -25,6 +25,7 @@ #include #include #include +#include using namespace std::literals; @@ -230,7 +231,45 @@ std::vector> SampleCollection_t::get(double from /*= 0.0 return res; } -ActorLineageProfilerT::ActorLineageProfilerT() { +struct ProfilerImpl { + boost::asio::io_context context; + boost::asio::executor_work_guard workGuard; + boost::asio::steady_timer timer; + std::thread mainThread; + unsigned frequency; + + SampleCollection collection; + + ProfilerImpl() : workGuard(context.get_executor()), timer(context) { + mainThread = std::thread([this]() { context.run(); }); + } + ~ProfilerImpl() { + setFrequency(0); + workGuard.reset(); + mainThread.join(); + } + + void profileHandler(boost::system::error_code const& ec) { + if (ec) { + return; + } + collection->refresh(); + timer = boost::asio::steady_timer(context, std::chrono::microseconds(1000000 / frequency)); + timer.async_wait([this](auto const& ec) { profileHandler(ec); }); + } + + void setFrequency(unsigned frequency) { + 
boost::asio::post(context, [this, frequency]() { + this->frequency = frequency; + timer.cancel(); + if (frequency > 0) { + profileHandler(boost::system::error_code{}); + } + }); + } +}; + +ActorLineageProfilerT::ActorLineageProfilerT() : impl(new ProfilerImpl()) { collection->collector()->addGetter(WaitState::Network, std::bind(&ActorLineageSet::copy, std::ref(g_network->getActorLineageSet()))); collection->collector()->addGetter( @@ -243,50 +282,15 @@ ActorLineageProfilerT::ActorLineageProfilerT() { } ActorLineageProfilerT::~ActorLineageProfilerT() { - stop(); -} - -void ActorLineageProfilerT::stop() { - setFrequency(0); + delete impl; } void ActorLineageProfilerT::setFrequency(unsigned frequency) { - unsigned oldFrequency = this->frequency; - bool change = this->frequency != frequency; - this->frequency = frequency; - - if (change) { - // Profiler thread will automatically switch to new frequency after - // being triggered by the the condition variable. Only need to start a - // new profiler thread if the old one has been stopped due to the - // profiler thread returning (frequency set to 0). 
- if (oldFrequency == 0 && frequency != 0) { - std::thread(&ActorLineageProfilerT::profile, this).detach(); - } - cond.notify_all(); - } + impl->setFrequency(frequency); } -void ActorLineageProfilerT::profile() { - static std::atomic_int profileThreadCount = 0; - ASSERT(++profileThreadCount == 1); - - for (;;) { - collection->refresh(); - if (frequency == 0) { - profileThreadCount--; - return; - } - { - std::unique_lock lock{ mutex }; - cond.wait_for(lock, std::chrono::microseconds(1000000 / frequency)); - // cond.wait_until(lock, lastSample + std::chrono::milliseconds) - } - if (frequency == 0) { - profileThreadCount--; - return; - } - } +boost::asio::io_context& ActorLineageProfilerT::context() { + return impl->context; } SampleIngestor::~SampleIngestor() {} diff --git a/fdbclient/ActorLineageProfiler.h b/fdbclient/ActorLineageProfiler.h index c7348d83c1..0e7c8e7385 100644 --- a/fdbclient/ActorLineageProfiler.h +++ b/fdbclient/ActorLineageProfiler.h @@ -152,20 +152,25 @@ public: using SampleCollection = crossbow::singleton; +struct ProfilerImpl; + +namespace boost { +namespace asio { +// forward declare io_context because including boost asio is super expensive +class io_context; +} // namespace asio +} // namespace boost + class ActorLineageProfilerT { friend struct crossbow::create_static; - ActorLineageProfilerT(); + ProfilerImpl* impl; SampleCollection collection; - std::thread profilerThread; - std::atomic frequency = 0; - std::mutex mutex; - std::condition_variable cond; - void profile(); + ActorLineageProfilerT(); public: ~ActorLineageProfilerT(); void setFrequency(unsigned frequency); - void stop(); + boost::asio::io_context& context(); }; using ActorLineageProfiler = crossbow::singleton; diff --git a/fdbclient/FluentDSampleIngestor.cpp b/fdbclient/FluentDSampleIngestor.cpp index 0a81ba0613..f1609ae5b3 100644 --- a/fdbclient/FluentDSampleIngestor.cpp +++ b/fdbclient/FluentDSampleIngestor.cpp @@ -20,46 +20,153 @@ #include "fdbclient/ActorLineageProfiler.h" 
#include +#include namespace { + +boost::asio::ip::address ipAddress(IPAddress const& n) { + if (n.isV6()) { + return boost::asio::ip::address_v6(n.toV6()); + } else { + return boost::asio::ip::address_v4(n.toV4()); + } +} + +template +boost::asio::ip::basic_endpoint toEndpoint(NetworkAddress const n) { + return boost::asio::ip::basic_endpoint(ipAddress(n.ip), n.port); +} + struct FluentDSocket { virtual ~FluentDSocket() {} - virtual void connect(NetworkAddress& endpoint) = 0; - // virtual void send() = 0; + virtual void connect(NetworkAddress const& endpoint) = 0; + virtual void send(std::shared_ptr const& sample) = 0; + virtual const boost::system::error_code& failed() const = 0; }; -struct TCPFluentDSocket : FluentDSocket { - boost::asio::io_context& io_context; - boost::asio::ip::tcp::socket socket; - TCPFluentDSocket(boost::asio::io_context& context) : io_context(context), socket(context) {} - void connect(NetworkAddress& endpoint) override { boost::asio::ip::tcp::resolver resolver(io_context); } +template +struct FluentDSocketImpl : FluentDSocket, std::enable_shared_from_this> { + static constexpr unsigned MAX_QUEUE_SIZE = 100; + boost::asio::io_context& context; + typename Protocol::socket socket; + FluentDSocketImpl(boost::asio::io_context& context) : context(context), socket(context) {} + bool ready = false; + std::deque> queue; + boost::system::error_code _failed; + + const boost::system::error_code& failed() const override { return _failed; } + + void sendCompletionHandler(boost::system::error_code const& ec) { + if (ec) { + // TODO: trace error + _failed = ec; + return; + } + if (queue.empty()) { + ready = true; + } else { + auto sample = queue.front(); + queue.pop_front(); + sendImpl(sample); + } + } + + template + std::enable_if_t> sendImpl(std::shared_ptr const& sample) { + boost::asio::async_write( + socket, + boost::asio::const_buffer(sample->data, sample->size), + [sample, self = this->shared_from_this()](auto const& ec, size_t) { 
self->sendCompletionHandler(ec); }); + } + + template + std::enable_if_t> sendImpl(std::shared_ptr const& sample) { + socket.async_send( + boost::asio::const_buffer(sample->data, sample->size), + [sample, self = this->shared_from_this()](auto const& ec, size_t) { self->sendCompletionHandler(ec); }); + } + + void send(std::shared_ptr const& sample) override { + if (_failed) { + return; + } + if (ready) { + ready = false; + sendImpl(sample); + } else { + if (queue.size() < MAX_QUEUE_SIZE) { + queue.push_back(sample); + } // TODO: else trace a warning + } + } + + void connect(NetworkAddress const& endpoint) override { + auto to = toEndpoint(endpoint); + socket.async_connect(to, [self = this->shared_from_this()](boost::system::error_code const& ec) { + if (ec) { + // TODO: error handling + self->_failed = ec; + return; + } + self->ready = true; + }); + } }; -struct UDPFluentDSocket : FluentDSocket { - boost::asio::io_context& io_context; - boost::asio::ip::tcp::socket socket; - UDPFluentDSocket(boost::asio::io_context& context) : io_context(context), socket(context) {} - void connect(NetworkAddress& endpoint) override {} -}; } // namespace struct FluentDIngestorImpl { using Protocol = FluentDIngestor::Protocol; - boost::asio::io_context io_context; + Protocol protocol; + NetworkAddress endpoint; + boost::asio::io_context& io_context; std::unique_ptr socket; - FluentDIngestorImpl(Protocol protocol, NetworkAddress& endpoint) { + boost::asio::steady_timer retryTimer; + FluentDIngestorImpl(Protocol protocol, NetworkAddress const& endpoint) + : protocol(protocol), endpoint(endpoint), io_context(ActorLineageProfiler::instance().context()), + retryTimer(io_context) { + connect(); + } + + ~FluentDIngestorImpl() { retryTimer.cancel(); } + + void connect() { switch (protocol) { case Protocol::TCP: - socket.reset(new TCPFluentDSocket(io_context)); + socket.reset(new FluentDSocketImpl(io_context)); break; case Protocol::UDP: - socket.reset(new UDPFluentDSocket(io_context)); + 
socket.reset(new FluentDSocketImpl(io_context)); break; } socket->connect(endpoint); } + + void retry() { + retryTimer = boost::asio::steady_timer(io_context, std::chrono::seconds(1)); + retryTimer.async_wait([this](auto const& ec) { + if (ec) { + return; + } + connect(); + }); + socket.reset(); + } }; FluentDIngestor::~FluentDIngestor() {} -FluentDIngestor::FluentDIngestor(Protocol protocol, NetworkAddress& endpoint) {} \ No newline at end of file +FluentDIngestor::FluentDIngestor(Protocol protocol, NetworkAddress& endpoint) + : impl(new FluentDIngestorImpl(protocol, endpoint)) {} + +void FluentDIngestor::ingest(const std::shared_ptr& sample) { + if (!impl->socket) { + // the connection failed in the past and we wait for a timeout before we retry + return; + } else if (impl->socket->failed()) { + impl->retry(); + return; + } else { + impl->socket->send(sample); + } +} From adb0ce97769721ba5d95206880ccf3570d5355d2 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Thu, 22 Apr 2021 17:52:27 -0600 Subject: [PATCH 092/180] address review comments --- fdbclient/TransactionLineage.h | 2 +- fdbserver/storageserver.actor.cpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fdbclient/TransactionLineage.h b/fdbclient/TransactionLineage.h index b4518de231..711d89101c 100644 --- a/fdbclient/TransactionLineage.h +++ b/fdbclient/TransactionLineage.h @@ -94,7 +94,7 @@ class ScopedLineage { public: ScopedLineage(V T::*member, V const& value) : member(member) { - auto val = currentLineage->modify(member); + auto& val = currentLineage->modify(member); before = val; val = value; } diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp index 7538685acf..254484710d 100644 --- a/fdbserver/storageserver.actor.cpp +++ b/fdbserver/storageserver.actor.cpp @@ -522,7 +522,7 @@ public: // process of committing makeShardDurable) // == v -> k is readable (from storage+versionedData) @ [storageVersion,v], and not being updated // when version 
increases - // == latestVersion -> k is readable (from stora ge+versionedData) @ [storageVersion,version.get()], and thus + // == latestVersion -> k is readable (from storage+versionedData) @ [storageVersion,version.get()], and thus // stays available when version increases CoalescedKeyRangeMap newestAvailableVersion; @@ -875,7 +875,7 @@ public: } return fun(this, request); } - }; +}; const StringRef StorageServer::CurrentRunningFetchKeys::emptyString = LiteralStringRef(""); const KeyRangeRef StorageServer::CurrentRunningFetchKeys::emptyKeyRange = From 3e18b857a872275e912043c4b4a66e1006e1375f Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Fri, 23 Apr 2021 11:02:53 -0600 Subject: [PATCH 093/180] add command line args to configure profile ingestor --- fdbclient/ActorLineageProfiler.cpp | 55 ++++++++++++++++++++++++++++++ fdbclient/ActorLineageProfiler.h | 7 +++- fdbserver/fdbserver.actor.cpp | 34 ++++++++++++++++-- 3 files changed, 93 insertions(+), 3 deletions(-) diff --git a/fdbclient/ActorLineageProfiler.cpp b/fdbclient/ActorLineageProfiler.cpp index a62d0ae890..3b300f1653 100644 --- a/fdbclient/ActorLineageProfiler.cpp +++ b/fdbclient/ActorLineageProfiler.cpp @@ -305,3 +305,58 @@ void samplingProfilerUpdateFrequency(std::optional freq) { TraceEvent(SevInfo, "SamplingProfilerUpdateFrequency").detail("Frequency", frequency); ActorLineageProfiler::instance().setFrequency(frequency); } + +void ProfilerConfigT::reset(std::map const& config) { + bool expectNoMore = false, useFluentD = false, useTCP = false; + std::string endpoint; + ConfigError err; + for (auto& kv : config) { + if (expectNoMore) { + err.description = format("Unexpected option %s", kv.first.c_str()); + throw err; + } + if (kv.first == "collector") { + std::string val = kv.second; + std::for_each(val.begin(), val.end(), [](auto c) { return std::tolower(c); }); + if (val == "none") { + setBackend(std::make_shared()); + } else if (val == "fluentd") { + useFluentD = true; + } else { + err.description = 
format("Unsupported collector: %s", val.c_str()); + throw err; + } + } else if (kv.first == "collector_endpoint") { + endpoint = kv.second; + } else if (kv.first == "collector_protocol") { + auto val = kv.second; + std::for_each(val.begin(), val.end(), [](auto c) { return std::tolower(c); }); + if (val == "tcp") { + useTCP = true; + } else if (val == "udp") { + useTCP = false; + } else { + err.description = format("Unsupported protocol for fluentd: %s", kv.second.c_str()); + throw err; + } + } else { + err.description = format("Unknown option %s", kv.first.c_str()); + throw err; + } + } + if (useFluentD) { + if (endpoint.empty()) { + err.description = "Endpoint is required for fluentd ingestor"; + throw err; + } + NetworkAddress address; + try { + address = NetworkAddress::parse(endpoint); + } catch (Error& e) { + err.description = format("Can't parse address %s", endpoint.c_str()); + throw err; + } + setBackend(std::make_shared( + useTCP ? FluentDIngestor::Protocol::TCP : FluentDIngestor::Protocol::TCP, address)); + } +} diff --git a/fdbclient/ActorLineageProfiler.h b/fdbclient/ActorLineageProfiler.h index 0e7c8e7385..d09aba7d2c 100644 --- a/fdbclient/ActorLineageProfiler.h +++ b/fdbclient/ActorLineageProfiler.h @@ -79,6 +79,10 @@ public: // interface ~FluentDIngestor(); }; +struct ConfigError { + std::string description; +}; + class ProfilerConfigT { private: // private types using Lock = std::unique_lock; @@ -91,9 +95,10 @@ private: // construction ProfilerConfigT() {} ProfilerConfigT(ProfilerConfigT const&) = delete; ProfilerConfigT& operator=(ProfilerConfigT const&) = delete; + void setBackend(std::shared_ptr ingestor) { this->ingestor = ingestor; } public: - void setBackend(std::shared_ptr ingestor) { this->ingestor = ingestor; } + void reset(std::map const& config); }; using ProfilerConfig = crossbow::singleton; diff --git a/fdbserver/fdbserver.actor.cpp b/fdbserver/fdbserver.actor.cpp index 136cd90c3d..75247d85cf 100644 --- a/fdbserver/fdbserver.actor.cpp 
+++ b/fdbserver/fdbserver.actor.cpp @@ -68,6 +68,7 @@ #include "flow/Tracing.h" #include "flow/WriteOnlySet.h" #include "flow/UnitTest.h" +#include "fdbclient/ActorLineageProfiler.h" #if defined(__linux__) || defined(__FreeBSD__) #include @@ -85,6 +86,8 @@ #include "flow/actorcompiler.h" // This must be the last #include. +using namespace std::literals; + // clang-format off enum { OPT_CONNFILE, OPT_SEEDCONNFILE, OPT_SEEDCONNSTRING, OPT_ROLE, OPT_LISTEN, OPT_PUBLICADDR, OPT_DATAFOLDER, OPT_LOGFOLDER, OPT_PARENTPID, OPT_TRACER, OPT_NEWCONSOLE, @@ -92,7 +95,7 @@ enum { OPT_DCID, OPT_MACHINE_CLASS, OPT_BUGGIFY, OPT_VERSION, OPT_BUILD_FLAGS, OPT_CRASHONERROR, OPT_HELP, OPT_NETWORKIMPL, OPT_NOBUFSTDOUT, OPT_BUFSTDOUTERR, OPT_TRACECLOCK, OPT_NUMTESTERS, OPT_DEVHELP, OPT_ROLLSIZE, OPT_MAXLOGS, OPT_MAXLOGSSIZE, OPT_KNOB, OPT_UNITTESTPARAM, OPT_TESTSERVERS, OPT_TEST_ON_SERVERS, OPT_METRICSCONNFILE, OPT_METRICSPREFIX, OPT_LOGGROUP, OPT_LOCALITY, OPT_IO_TRUST_SECONDS, OPT_IO_TRUST_WARN_ONLY, OPT_FILESYSTEM, OPT_PROFILER_RSS_SIZE, OPT_KVFILE, - OPT_TRACE_FORMAT, OPT_WHITELIST_BINPATH, OPT_BLOB_CREDENTIAL_FILE + OPT_TRACE_FORMAT, OPT_WHITELIST_BINPATH, OPT_BLOB_CREDENTIAL_FILE, OPT_PROFILER }; CSimpleOpt::SOption g_rgOptions[] = { @@ -172,9 +175,10 @@ CSimpleOpt::SOption g_rgOptions[] = { { OPT_METRICSPREFIX, "--metrics_prefix", SO_REQ_SEP }, { OPT_IO_TRUST_SECONDS, "--io_trust_seconds", SO_REQ_SEP }, { OPT_IO_TRUST_WARN_ONLY, "--io_trust_warn_only", SO_NONE }, - { OPT_TRACE_FORMAT , "--trace_format", SO_REQ_SEP }, + { OPT_TRACE_FORMAT, "--trace_format", SO_REQ_SEP }, { OPT_WHITELIST_BINPATH, "--whitelist_binpath", SO_REQ_SEP }, { OPT_BLOB_CREDENTIAL_FILE, "--blob_credential_file", SO_REQ_SEP }, + { OPT_PROFILER, "--profiler_", SO_REQ_SEP}, #ifndef TLS_DISABLED TLS_OPTION_FLAGS @@ -618,6 +622,11 @@ static void printUsage(const char* name, bool devhelp) { " Machine class (valid options are storage, transaction," " resolution, grv_proxy, commit_proxy, master, test, unset, 
stateless, log, router," " and cluster_controller)."); + printOptionUsage("--profiler_", + "Set a actor profiler option. Supported options are:\n" + " collector -- None or FluentD (FluentD requires collector_endpoint to be set)\n" + " collector_endpoint -- IP:PORT of the fluentd server\n" + " collector_protocol -- UDP or TCP (default is UDP)"); #ifndef TLS_DISABLED printf(TLS_HELP); #endif @@ -981,6 +990,8 @@ struct CLIOptions { Standalone machineId; UnitTestParameters testParams; + std::map profilerConfig; + static CLIOptions parseArgs(int argc, char* argv[]) { CLIOptions opts; opts.parseArgsInternal(argc, argv); @@ -1054,6 +1065,18 @@ private: knobs.push_back(std::make_pair(syn, args.OptionArg())); break; } + case OPT_PROFILER: { + std::string syn = args.OptionSyntax(); + std::string_view key = syn; + auto prefix = "--profiler_"sv; + if (key.find(prefix) != 0) { + fprintf(stderr, "ERROR: unable to parse profiler option '%s'\n", syn.c_str()); + flushAndExit(FDB_EXIT_ERROR); + } + key.remove_prefix(prefix.size()); + profilerConfig.emplace(key, args.OptionArg()); + break; + }; case OPT_UNITTESTPARAM: { std::string syn = args.OptionSyntax(); if (!StringRef(syn).startsWith(LiteralStringRef("--test_"))) { @@ -1454,6 +1477,13 @@ private: } } + try { + ProfilerConfig::instance().reset(profilerConfig); + } catch (ConfigError& e) { + printf("Error seting up profiler: %s", e.description.c_str()); + flushAndExit(FDB_EXIT_ERROR); + } + if (seedConnString.length() && seedConnFile.length()) { fprintf( stderr, "%s\n", "--seed_cluster_file and --seed_connection_string may not both be specified at once."); From 52bba82e8ef24d1eaef8ee0347e04a1bc0bf8e85 Mon Sep 17 00:00:00 2001 From: Lukas Joswiak Date: Fri, 23 Apr 2021 14:05:05 -0700 Subject: [PATCH 094/180] Add window size configuration key --- fdbclient/ActorLineageProfiler.cpp | 10 ++++++++++ fdbclient/ActorLineageProfiler.h | 1 + fdbclient/GlobalConfig.actor.cpp | 1 + fdbclient/GlobalConfig.actor.h | 1 + 
fdbclient/NativeAPI.actor.cpp | 1 + 5 files changed, 14 insertions(+) diff --git a/fdbclient/ActorLineageProfiler.cpp b/fdbclient/ActorLineageProfiler.cpp index fe335d90d5..46a74bace7 100644 --- a/fdbclient/ActorLineageProfiler.cpp +++ b/fdbclient/ActorLineageProfiler.cpp @@ -298,3 +298,13 @@ void samplingProfilerUpdateFrequency(std::optional freq) { TraceEvent(SevInfo, "SamplingProfilerUpdateFrequency").detail("Frequency", frequency); ActorLineageProfiler::instance().setFrequency(frequency); } + +// Callback used to update the sample collector window size. +void samplingProfilerUpdateWindow(std::optional window) { + double duration = 0; + if (window.has_value()) { + duration = std::any_cast(window.value()); + } + TraceEvent(SevInfo, "SamplingProfilerUpdateWindow").detail("Duration", duration); + SampleCollection::instance().setWindowSize(duration); +} diff --git a/fdbclient/ActorLineageProfiler.h b/fdbclient/ActorLineageProfiler.h index b73e7d04eb..c612274133 100644 --- a/fdbclient/ActorLineageProfiler.h +++ b/fdbclient/ActorLineageProfiler.h @@ -32,6 +32,7 @@ #include "flow/flow.h" void samplingProfilerUpdateFrequency(std::optional freq); +void samplingProfilerUpdateWindow(std::optional window); struct IALPCollectorBase { virtual std::optional collect(ActorLineage*) = 0; diff --git a/fdbclient/GlobalConfig.actor.cpp b/fdbclient/GlobalConfig.actor.cpp index 79bbbb2202..1d06d84880 100644 --- a/fdbclient/GlobalConfig.actor.cpp +++ b/fdbclient/GlobalConfig.actor.cpp @@ -35,6 +35,7 @@ const KeyRef transactionTagSampleRate = LiteralStringRef("config/transaction_tag const KeyRef transactionTagSampleCost = LiteralStringRef("config/transaction_tag_sample_cost"); const KeyRef samplingFrequency = LiteralStringRef("visibility/sampling/frequency"); +const KeyRef samplingWindow = LiteralStringRef("visibility/sampling/window"); GlobalConfig::GlobalConfig() : lastUpdate(0) {} diff --git a/fdbclient/GlobalConfig.actor.h b/fdbclient/GlobalConfig.actor.h index 
de98c442e1..65028dcd92 100644 --- a/fdbclient/GlobalConfig.actor.h +++ b/fdbclient/GlobalConfig.actor.h @@ -52,6 +52,7 @@ extern const KeyRef transactionTagSampleRate; extern const KeyRef transactionTagSampleCost; extern const KeyRef samplingFrequency; +extern const KeyRef samplingWindow; // Structure used to hold the values stored by global configuration. The arena // is used as memory to store both the key and the value (the value is only diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index c329a17546..d9e24f79dc 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -965,6 +965,7 @@ DatabaseContext::DatabaseContext(Reference Date: Thu, 22 Apr 2021 20:55:06 -0700 Subject: [PATCH 095/180] Add API to read samples from worker --- fdbcli/fdbcli.actor.cpp | 8 ++ fdbclient/ActorLineageProfiler.cpp | 7 +- fdbclient/ActorLineageProfiler.h | 2 +- fdbclient/ProcessInterface.h | 45 +++++++- fdbclient/SpecialKeySpace.actor.cpp | 169 +++++++++++++++++++++++++--- fdbclient/SpecialKeySpace.actor.h | 7 ++ fdbserver/worker.actor.cpp | 24 +++- 7 files changed, 242 insertions(+), 20 deletions(-) diff --git a/fdbcli/fdbcli.actor.cpp b/fdbcli/fdbcli.actor.cpp index d655601e22..d21775d47f 100644 --- a/fdbcli/fdbcli.actor.cpp +++ b/fdbcli/fdbcli.actor.cpp @@ -4698,6 +4698,14 @@ ACTOR Future cli(CLIOptions opt, LineNoise* plinenoise) { } catch (Error& e) { if (e.code() != error_code_actor_cancelled) fprintf(stderr, "ERROR: %s (%d)\n", e.what(), e.code()); + if (e.code() == error_code_special_keys_api_failure) { + auto f = tr->get(LiteralStringRef("\xff\xff/error_message")); + ASSERT(f.isReady()); + if (f.get().present()) { + auto msg = f.get().get().toString(); + printf("Special Key space error_message: %s\n", msg.c_str()); + } + } is_error = true; if (intrans) { printf("Rolling back current transaction\n"); diff --git a/fdbclient/ActorLineageProfiler.cpp b/fdbclient/ActorLineageProfiler.cpp index 46a74bace7..46de22d2fc 100644 --- 
a/fdbclient/ActorLineageProfiler.cpp +++ b/fdbclient/ActorLineageProfiler.cpp @@ -63,13 +63,14 @@ class Packer : public msgpack::packer { std::string_view, std::vector, std::map, - std::map>::populate(visitorMap); + std::map, + std::vector>>::populate(visitorMap); } void visit(const std::any& val, Packer& packer) { auto iter = visitorMap.find(val.type()); if (iter == visitorMap.end()) { - // TODO: trace error + TraceEvent(SevError, "PackerTypeNotFound").detail("Type", val.type().name()); } else { iter->second(val, packer); } @@ -197,7 +198,7 @@ std::shared_ptr SampleCollectorT::collect() { void SampleCollection_t::refresh() { auto sample = _collector->collect(); - auto min = std::max(sample->time - windowSize, sample->time); + auto min = std::min(sample->time - windowSize, sample->time); { Lock _{ mutex }; data.emplace_back(std::move(sample)); diff --git a/fdbclient/ActorLineageProfiler.h b/fdbclient/ActorLineageProfiler.h index c612274133..82cd22cb1c 100644 --- a/fdbclient/ActorLineageProfiler.h +++ b/fdbclient/ActorLineageProfiler.h @@ -78,7 +78,7 @@ class SampleCollection_t { SampleCollector _collector; mutable std::mutex mutex; - std::atomic windowSize = 0.0; + std::atomic windowSize = 5.0; std::deque> data; public: diff --git a/fdbclient/ProcessInterface.h b/fdbclient/ProcessInterface.h index c76cf9ef48..9b648d8127 100644 --- a/fdbclient/ProcessInterface.h +++ b/fdbclient/ProcessInterface.h @@ -18,6 +18,7 @@ * limitations under the License. 
*/ +#include "fdbclient/AnnotateActor.h" #include "fdbclient/FDBTypes.h" #include "fdbrpc/fdbrpc.h" @@ -26,11 +27,11 @@ constexpr UID WLTOKEN_PROCESS(-1, 11); struct ProcessInterface { constexpr static FileIdentifier file_identifier = 985636; RequestStream getInterface; - RequestStream echo; + RequestStream actorLineage; template void serialize(Ar& ar) { - serializer(ar, echo); + serializer(ar, actorLineage); } }; @@ -55,3 +56,43 @@ struct EchoRequest { serializer(ar, message, reply); } }; + +// This type is used to send serialized sample data over the network. +// TODO: Possible to combine with `Sample`? +struct SerializedSample { + constexpr static FileIdentifier file_identifier = 15785634; + + WaitState waitState; + double time; + int seq; + std::string data; + + template + void serialize(Ar& ar) { + serializer(ar, waitState, time, seq, data); + } +}; + +struct ActorLineageReply { + constexpr static FileIdentifier file_identifier = 1887656; + std::vector samples; + + template + void serialize(Ar& ar) { + serializer(ar, samples); + } +}; + +struct ActorLineageRequest { + constexpr static FileIdentifier file_identifier = 11654765; + WaitState waitStateStart, waitStateEnd; + double timeStart, timeEnd; + int seqStart, seqEnd; + // TODO: Add end values + ReplyPromise reply; + + template + void serialize(Ar& ar) { + serializer(ar, waitStateStart, waitStateEnd, timeStart, timeEnd, seqStart, seqEnd, reply); + } +}; diff --git a/fdbclient/SpecialKeySpace.actor.cpp b/fdbclient/SpecialKeySpace.actor.cpp index 603887fcf6..f251feddfa 100644 --- a/fdbclient/SpecialKeySpace.actor.cpp +++ b/fdbclient/SpecialKeySpace.actor.cpp @@ -21,6 +21,10 @@ #include "boost/lexical_cast.hpp" #include "boost/algorithm/string.hpp" +#include + +#include + #include "fdbclient/Knobs.h" #include "fdbclient/ProcessInterface.h" #include "fdbclient/GlobalConfig.actor.h" @@ -96,6 +100,15 @@ std::unordered_map SpecialKeySpace::managementApiCommandT 
.withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin) } }; +std::unordered_map SpecialKeySpace::actorLineageApiCommandToRange = { + { "state", + KeyRangeRef(LiteralStringRef("state/"), LiteralStringRef("state0")) + .withPrefix(moduleToBoundary[MODULE::ACTORLINEAGE].begin) }, + { "time", + KeyRangeRef(LiteralStringRef("time/"), LiteralStringRef("time0")) + .withPrefix(moduleToBoundary[MODULE::ACTORLINEAGE].begin) } +}; + std::set SpecialKeySpace::options = { "excluded/force", "failed/force" }; std::set SpecialKeySpace::tracingOptions = { kTracingTransactionIdKey, kTracingTokenKey }; @@ -1925,26 +1938,156 @@ void ClientProfilingImpl::clear(ReadYourWritesTransaction* ryw, const KeyRef& ke ActorLineageImpl::ActorLineageImpl(KeyRangeRef kr) : SpecialKeyRangeReadImpl(kr) {} +void parse(StringRef& val, int& i) { + i = std::stoi(val.toString()); +} + +void parse(StringRef& val, double& d) { + d = std::stod(val.toString()); +} + +void parse(StringRef& val, WaitState& w) { + if (val == LiteralStringRef("disk")) { + w = WaitState::Disk; + } else if (val == LiteralStringRef("network")) { + w = WaitState::Network; + } else if (val == LiteralStringRef("running")) { + w = WaitState::Running; + } else { + throw std::range_error("failed to parse run state"); + } +} + +void parse(StringRef& val, NetworkAddress& a) { + auto address = NetworkAddress::parse(val.toString()); + if (!address.isValid()) { + throw std::invalid_argument("invalid host"); + } + a = address; +} + +// Base case function for parsing function below. +template +void parse(std::vector::iterator it, std::vector::iterator end, T& t1) { + if (it == end) { + return; + } + parse(*it, t1); +} + +// Given an iterator into a vector of string tokens, an iterator to the end of +// the search space in the vector (exclusive), and a list of references to +// types, parses each token in the vector into the associated type according to +// the order of the arguments. 
+// +// For example, given the vector ["1", "1.5", "127.0.0.1:4000"] and the +// argument list int a, double b, NetworkAddress c, after this function returns +// each parameter passed in will hold the parsed value from the token list. +// +// The appropriate parsing function must be implemented for the type you wish +// to parse. See the existing parsing functions above, and add your own if +// necessary. +template +void parse(std::vector::iterator it, std::vector::iterator end, T& t1, Types&... remaining) { + // Return as soon as all tokens have been parsed. This allows parameters + // passed at the end to act as optional parameters -- they will only be set + // if the value exists. + if (it == end) { + return; + } + + try { + parse(*it, t1); + parse(++it, end, remaining...); + } catch (Error& e) { + throw e; + } catch (std::exception& e) { + throw e; + } +} + ACTOR static Future> actorLineageGetRangeActor(ReadYourWritesTransaction* ryw, KeyRef prefix, KeyRangeRef kr) { state Standalone result; - Standalone addressString = kr.begin.removePrefix(prefix); + + // Set default values for all fields. The default will be used if the field + // is missing in the key. + state NetworkAddress host; + state WaitState waitStateStart = WaitState{ 0 }; + state WaitState waitStateEnd = WaitState{ 2 }; + state double timeStart = 0; + state double timeEnd = std::numeric_limits::max(); + state int seqStart = 0; + state int seqEnd = std::numeric_limits::max(); + + state std::vector beginValues = kr.begin.removePrefix(prefix).splitAny("/"_sr); + state std::vector endValues = kr.end.removePrefix(prefix).splitAny("/"_sr); + // Require index (either "state" or "time") and address:port. 
+ if (beginValues.size() < 2 || endValues.size() < 2) { + ryw->setSpecialKeySpaceErrorMsg("missing required parameters (index, host)"); + throw special_keys_api_failure(); + } try { - auto address = NetworkAddress::parse(addressString.contents().toString()); - - state ProcessInterface process; - process.getInterface = RequestStream(Endpoint({ address }, WLTOKEN_PROCESS)); - ProcessInterface p = wait(retryBrokenPromise(process.getInterface, GetProcessInterfaceRequest{})); - process = p; - - EchoRequest echoRequest; - echoRequest.message = "Hello"; - std::string response = wait(process.echo.getReply(echoRequest)); - result.push_back_deep(result.arena(), KeyValueRef(kr.begin, response)); + state NetworkAddress endRangeHost; + if (SpecialKeySpace::getActorLineageApiCommandRange("state").contains(kr)) { + // For the range \xff\xff/actor_lineage/state/ip:port/wait-state/time/seq + parse(beginValues.begin() + 1, beginValues.end(), host, waitStateStart, timeStart, seqStart); + if (kr.begin != kr.end) { + parse(endValues.begin() + 1, endValues.end(), endRangeHost, waitStateEnd, timeEnd, seqEnd); + } + } else if (SpecialKeySpace::getActorLineageApiCommandRange("time").contains(kr)) { + // For the range \xff\xff/actor_lineage/time/ip:port/time/wait-state/seq + parse(beginValues.begin() + 1, beginValues.end(), host, timeStart, waitStateStart, seqStart); + if (kr.begin != kr.end) { + parse(endValues.begin() + 1, endValues.end(), endRangeHost, timeEnd, waitStateEnd, seqEnd); + } + } else { + ryw->setSpecialKeySpaceErrorMsg("invalid index in actor_lineage"); + throw special_keys_api_failure(); + } } catch (Error& e) { - TraceEvent(SevDebug, "SpecialKeysNetworkParseError").error(e); + if (e.code() != special_keys_api_failure().code()) { + ryw->setSpecialKeySpaceErrorMsg("failed to parse key"); + throw special_keys_api_failure(); + } else { + throw e; + } + } + + if (kr.begin != kr.end && host != endRangeHost) { + // The client doesn't know about all the hosts, so a get range 
covering + // multiple hosts has no way of knowing which IP:port combos to use. + ryw->setSpecialKeySpaceErrorMsg("the host must remain the same on both ends of the range"); + throw special_keys_api_failure(); + } + + // Open endpoint to target process on each call. This can be optimized at + // some point... + state ProcessInterface process; + process.getInterface = RequestStream(Endpoint({ host }, WLTOKEN_PROCESS)); + ProcessInterface p = wait(retryBrokenPromise(process.getInterface, GetProcessInterfaceRequest{})); + process = p; + + ActorLineageRequest actorLineageRequest; + actorLineageRequest.waitStateStart = waitStateStart; + actorLineageRequest.waitStateEnd = waitStateEnd; + actorLineageRequest.timeStart = timeStart; + actorLineageRequest.timeEnd = timeEnd; + actorLineageRequest.seqStart = seqStart; + actorLineageRequest.seqEnd = seqEnd; + ActorLineageReply reply = wait(process.actorLineage.getReply(actorLineageRequest)); + + for (const auto& sample : reply.samples) { + msgpack::object_handle oh = msgpack::unpack(sample.data.data(), sample.data.size()); + msgpack::object deserialized = oh.get(); + + std::ostringstream stream; + stream << deserialized; + // TODO: Fix return value for ranges + Key returnKey = prefix.withSuffix(host.toString() + "/" + std::to_string(sample.seq)); + result.push_back_deep(result.arena(), KeyValueRef(returnKey, stream.str())); } return result; diff --git a/fdbclient/SpecialKeySpace.actor.h b/fdbclient/SpecialKeySpace.actor.h index 08a3c6cfc5..fd16af7c2c 100644 --- a/fdbclient/SpecialKeySpace.actor.h +++ b/fdbclient/SpecialKeySpace.actor.h @@ -200,6 +200,12 @@ public: static KeyRef getManagementApiCommandPrefix(const std::string& command) { return managementApiCommandToRange.at(command).begin; } + static KeyRangeRef getActorLineageApiCommandRange(const std::string& command) { + return actorLineageApiCommandToRange.at(command); + } + static KeyRef getActorLineageApiCommandPrefix(const std::string& command) { + return 
actorLineageApiCommandToRange.at(command).begin; + } static Key getManagementApiCommandOptionSpecialKey(const std::string& command, const std::string& option); static const std::set& getManagementApiOptionsSet() { return options; } static const std::set& getTracingOptions() { return tracingOptions; } @@ -228,6 +234,7 @@ private: static std::unordered_map moduleToBoundary; static std::unordered_map managementApiCommandToRange; // management command to its special keys' range + static std::unordered_map actorLineageApiCommandToRange; static std::set options; // "/