From 76838a20b7bd936472d3431bbc7534afac883dad Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Fri, 30 Oct 2020 09:11:08 -0700 Subject: [PATCH 001/461] A model used to quickly simulate various GRV scenarios and algorithms --- contrib/grv_proxy_model/grv_test.py | 134 ++++++++ contrib/grv_proxy_model/plot.py | 107 +++++++ contrib/grv_proxy_model/priority.py | 40 +++ contrib/grv_proxy_model/proxy_model.py | 338 ++++++++++++++++++++ contrib/grv_proxy_model/rate_model.py | 83 +++++ contrib/grv_proxy_model/ratekeeper_model.py | 67 ++++ contrib/grv_proxy_model/smoother.py | 53 +++ contrib/grv_proxy_model/workload_model.py | 201 ++++++++++++ 8 files changed, 1023 insertions(+) create mode 100755 contrib/grv_proxy_model/grv_test.py create mode 100755 contrib/grv_proxy_model/plot.py create mode 100755 contrib/grv_proxy_model/priority.py create mode 100755 contrib/grv_proxy_model/proxy_model.py create mode 100755 contrib/grv_proxy_model/rate_model.py create mode 100755 contrib/grv_proxy_model/ratekeeper_model.py create mode 100644 contrib/grv_proxy_model/smoother.py create mode 100755 contrib/grv_proxy_model/workload_model.py diff --git a/contrib/grv_proxy_model/grv_test.py b/contrib/grv_proxy_model/grv_test.py new file mode 100755 index 0000000000..1cd0224538 --- /dev/null +++ b/contrib/grv_proxy_model/grv_test.py @@ -0,0 +1,134 @@ +#!/usr/bin/env python3 + +# +# grv_test.py +# +# This source file is part of the FoundationDB open source project +# +# Copyright 2013-2020 Apple Inc. and the FoundationDB project authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +import argparse +import inspect +import sys + +import rate_model +import workload_model +import proxy_model +import ratekeeper_model +from priority import Priority +from plot import Plotter + +parser = argparse.ArgumentParser() +parser.add_argument('-w', '--workload', type=str, help='Name of workload to run') +parser.add_argument('-r', '--ratekeeper', type=str, help='Name of ratekeeper model') +parser.add_argument('-d', '--duration', type=int, default=240, help='Duration of simulated test, in seconds. Defaults to 240.') +parser.add_argument('-L', '--limiter', type=str, default='Original', help='Name of limiter implementation. Defaults to \'Original\'.') +parser.add_argument('-p', '--proxy', type=str, default='ProxyModel', help='Name of proxy implementation. Defaults to \'ProxyModel\'.') +parser.add_argument('--list', action='store_true', default=False, help='List options for all models.') +parser.add_argument('--no-graph', action='store_true', default=False, help='Disable graphical output.') + +args = parser.parse_args() + +def print_choices_list(context=None): + if context == 'workload' or context is None: + print('Workloads:') + for w in workload_model.predefined_workloads.keys(): + print(' %s' % w) + + if context == 'ratekeeper' or context is None: + print('\nRatekeeper models:') + for r in ratekeeper_model.predefined_ratekeeper.keys(): + print(' %s' % r) + + proxy_model_classes = [c for c in [getattr(proxy_model, a) for a in dir(proxy_model)] if inspect.isclass(c)] + + if context == 'proxy' or context is None: + print('\nProxy models:') + for p in proxy_model_classes: + if issubclass(p, proxy_model.ProxyModel): + print(' %s' % p.__name__) + + if context == 'limiter' or context is None: + print('\nProxy limiters:') + for p in proxy_model_classes: + if issubclass(p, proxy_model.Limiter) and p != proxy_model.Limiter: + name = p.__name__ + if 
name.endswith('Limiter'): + name = name[0:-len('Limiter')] + print(' %s' % name) + +if args.workload is None or args.ratekeeper is None: + print('ERROR: A workload (-w/--workload) and ratekeeper model (-r/--ratekeeper) must be specified.\n') + print_choices_list() + sys.exit(1) + +if args.list: + print_choices_list() + sys.exit(0) + +def validate_class_type(var, name, superclass): + cls = getattr(var, name, None) + return cls is not None and inspect.isclass(cls) and issubclass(cls, superclass) + +if not args.ratekeeper in ratekeeper_model.predefined_ratekeeper: + print('Invalid ratekeeper model `%s\'' % args.ratekeeper) + print_choices_list('ratekeeper') + sys.exit(1) + +if not args.workload in workload_model.predefined_workloads: + print('Invalid workload model `%s\'' % args.workload) + print_choices_list('workload') + sys.exit(1) + +if not validate_class_type(proxy_model, args.proxy, proxy_model.ProxyModel): + print('Invalid proxy model `%s\'' % args.proxy) + print_choices_list('proxy') + sys.exit(1) + +limiter_name = args.limiter +if not validate_class_type(proxy_model, limiter_name, proxy_model.Limiter): + limiter_name += 'Limiter' + if not validate_class_type(proxy_model, limiter_name, proxy_model.Limiter): + print('Invalid proxy limiter `%s\'' % args.limiter) + print_choices_list('limiter') + sys.exit(1) + +ratekeeper = ratekeeper_model.predefined_ratekeeper[args.ratekeeper] +workload = workload_model.predefined_workloads[args.workload] + +limiter = getattr(proxy_model, limiter_name) +proxy = getattr(proxy_model, args.proxy)(args.duration, ratekeeper, workload, limiter) + +proxy.run() + +for priority in workload.priorities(): + latencies = sorted([p for t in proxy.results.latencies[priority].values() for p in t]) + total_started = sum(proxy.results.started[priority].values()) + still_queued = sum([r.count for r in proxy.request_queue if r.priority == priority]) + + if len(latencies) > 0: + print('\n%s: %d requests in %d seconds (rate=%f). %d still queued.' 
% (priority, total_started, proxy.time, float(total_started)/proxy.time, still_queued)) + print(' Median latency: %f' % latencies[len(latencies)//2]) + print(' 90%% latency: %f' % latencies[int(0.9*len(latencies))]) + print(' 99%% latency: %f' % latencies[int(0.99*len(latencies))]) + print(' 99.9%% latency: %f' % latencies[int(0.999*len(latencies))]) + print(' Max latency: %f' % latencies[-1]) + +print('') + +if not args.no_graph: + plotter = Plotter(proxy.results) + plotter.display() diff --git a/contrib/grv_proxy_model/plot.py b/contrib/grv_proxy_model/plot.py new file mode 100755 index 0000000000..9334e2c844 --- /dev/null +++ b/contrib/grv_proxy_model/plot.py @@ -0,0 +1,107 @@ +# +# plot.py +# +# This source file is part of the FoundationDB open source project +# +# Copyright 2013-2020 Apple Inc. and the FoundationDB project authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
#

import matplotlib.pyplot as plt

class Plotter:
    """Renders simulation results (a ProxyModel.Results) as a 3x3 grid of plots."""

    def __init__(self, results):
        self.results = results

    # FIX: these helpers use no instance state and are invoked as
    # Plotter.add_plot(...); mark them @staticmethod so the definitions match
    # how they are called (previously they were missing `self` entirely).
    @staticmethod
    def add_plot(data, time_resolution, label, use_avg=False):
        """Plot `data` ({time: value}) bucketed to `time_resolution`, summing
        values per bucket, or averaging them when use_avg is True."""
        out_data = {}
        counts = {}
        for t in data.keys():
            bucket = t // time_resolution * time_resolution
            out_data.setdefault(bucket, 0)
            counts.setdefault(bucket, 0)
            out_data[bucket] += data[t]
            counts[bucket] += 1

        if use_avg:
            out_data = {t: v / counts[t] for t, v in out_data.items()}

        plt.plot(list(out_data.keys()), list(out_data.values()), label=label)

    @staticmethod
    def add_plot_with_times(data, label):
        """Plot `data` ({time: value}) without any time bucketing."""
        plt.plot(list(data.keys()), list(data.values()), label=label)

    def display(self, time_resolution=0.1):
        """Draw all panels: start/queue rates, max queue sizes, per-priority
        latencies, and per-priority limiter internals."""
        plt.figure(figsize=(40, 9))
        plt.subplot(3, 3, 1)
        for priority in self.results.started.keys():
            Plotter.add_plot(self.results.started[priority], time_resolution, priority)

        plt.xlabel('Time (s)')
        plt.ylabel('Released/s')
        plt.legend()

        plt.subplot(3, 3, 2)
        for priority in self.results.queued.keys():
            Plotter.add_plot(self.results.queued[priority], time_resolution, priority)

        plt.xlabel('Time (s)')
        plt.ylabel('Requests/s')
        plt.legend()

        plt.subplot(3, 3, 3)
        for priority in self.results.unprocessed_queue_sizes.keys():
            # Each value is a list of queue-size samples per second; plot the max.
            data = {k: max(v) for (k, v) in self.results.unprocessed_queue_sizes[priority].items()}
            Plotter.add_plot(data, time_resolution, priority)

        plt.xlabel('Time (s)')
        plt.ylabel('Max queue size')
        plt.legend()

        num = 4
        for priority in self.results.latencies.keys():
            plt.subplot(3, 3, num)
            median_latencies = {k: v[int(0.5 * len(v))] if len(v) > 0 else 0 for (k, v) in self.results.latencies[priority].items()}
            percentile90_latencies = {k: v[int(0.9 * len(v))] if len(v) > 0 else 0 for (k, v) in self.results.latencies[priority].items()}
            max_latencies = {k: max(v) if len(v) > 0 else 0 for (k, v) in self.results.latencies[priority].items()}

            Plotter.add_plot(median_latencies, time_resolution, 'median')
            Plotter.add_plot(percentile90_latencies, time_resolution, '90th percentile')
            Plotter.add_plot(max_latencies, time_resolution, 'max')

            plt.xlabel('Time (s)')
            plt.ylabel(str(priority) + ' Latency (s)')
            plt.yscale('log')
            plt.legend()
            num += 1

        for priority in self.results.rate.keys():
            plt.subplot(3, 3, num)
            if len(self.results.rate[priority]) > 0:
                Plotter.add_plot(self.results.rate[priority], time_resolution, 'Rate', use_avg=True)
            if len(self.results.released[priority]) > 0:
                Plotter.add_plot(self.results.released[priority], time_resolution, 'Released', use_avg=True)
            if len(self.results.limit[priority]) > 0:
                Plotter.add_plot(self.results.limit[priority], time_resolution, 'Limit', use_avg=True)
            if len(self.results.limit_and_budget[priority]) > 0:
                Plotter.add_plot(self.results.limit_and_budget[priority], time_resolution, 'Limit and budget', use_avg=True)
            if len(self.results.budget[priority]) > 0:
                Plotter.add_plot(self.results.budget[priority], time_resolution, 'Budget', use_avg=True)

            plt.xlabel('Time (s)')
            plt.ylabel('Value (' + str(priority) + ')')
            plt.legend()
            num += 1

        plt.show()

# diff --git a/contrib/grv_proxy_model/priority.py b/contrib/grv_proxy_model/priority.py
# new file mode 100755
# index 0000000000..3ba5c05f2e
# --- /dev/null
# +++ b/contrib/grv_proxy_model/priority.py
# @@ -0,0 +1,40 @@
#
# priority.py
#
# This source file is part of the FoundationDB open source project
#
# Copyright 2013-2020 Apple Inc. and the FoundationDB project authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import functools

@functools.total_ordering
class Priority:
    """Transaction priority level; a lower priority_value is more important.

    functools.total_ordering derives <=, >, >= from __lt__ and __eq__.
    """

    def __init__(self, priority_value, label):
        self.priority_value = priority_value  # numeric rank (0 is highest priority)
        self.label = label                    # human-readable name

    def __lt__(self, other):
        return self.priority_value < other.priority_value

    # BUG FIX: total_ordering needs __eq__ to derive the remaining comparisons
    # correctly; the original relied on default identity equality, so two
    # Priority objects with the same priority_value compared unequal.
    def __eq__(self, other):
        return isinstance(other, Priority) and self.priority_value == other.priority_value

    # Defining __eq__ suppresses the inherited __hash__; restore hashing so
    # Priority instances keep working as dictionary keys.
    def __hash__(self):
        return hash(self.priority_value)

    def __str__(self):
        return self.label

    def __repr__(self):
        return repr(self.label)

# The three priority levels used throughout the model, most important first.
Priority.SYSTEM = Priority(0, "System")
Priority.DEFAULT = Priority(1, "Default")
Priority.BATCH = Priority(2, "Batch")

# diff --git a/contrib/grv_proxy_model/proxy_model.py b/contrib/grv_proxy_model/proxy_model.py
# new file mode 100755
# index 0000000000..9ca2a39bfe
# --- /dev/null
# +++ b/contrib/grv_proxy_model/proxy_model.py
# @@ -0,0 +1,338 @@
#
# proxy_model.py
#
# This source file is part of the FoundationDB open source project
#
# Copyright 2013-2020 Apple Inc. and the FoundationDB project authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+# + +import copy +import functools +import heapq + +from priority import Priority +from smoother import Smoother + +@functools.total_ordering +class Task: + def __init__(self, time, fxn): + self.time = time + self.fxn = fxn + + def __lt__(self, other): + return self.time < other.time + +class Limiter: + class UpdateRateParams: + def __init__(self, time): + self.time = time + + class UpdateLimitParams: + def __init__(self, time, elapsed): + self.time = time + self.elapsed = elapsed + + class CanStartParams: + def __init__(self, time, num_started, count): + self.time = time + self.num_started = num_started + self.count = count + + class UpdateBudgetParams: + def __init__(self, time, num_started, num_started_at_priority, min_priority, last_batch, queue_empty, elapsed): + self.time = time + self.num_started = num_started + self.num_started_at_priority = num_started_at_priority + self.min_priority = min_priority + self.last_batch = last_batch + self.queue_empty = queue_empty + self.elapsed = elapsed + + def __init__(self, priority, ratekeeper_model, proxy_model): + self.priority = priority + self.ratekeeper_model = ratekeeper_model + self.proxy_model = proxy_model + self.limit = 0 + self.rate = self.ratekeeper_model.get_limit(0, self.priority) + + def update_rate(self, params): + pass + + def update_limit(self, params): + pass + + def can_start(self, params): + pass + + def update_budget(self, params): + pass + +class OriginalLimiter(Limiter): + def __init__(self, priority, limit_rate_model, proxy_model): + Limiter.__init__(self, priority, limit_rate_model, proxy_model) + + def update_rate(self, params): + self.rate = self.ratekeeper_model.get_limit(params.time, self.priority) + + def update_limit(self, params): + self.limit = min(0, self.limit) + params.elapsed * self.rate + self.limit = min(self.limit, self.rate * 0.01) + self.limit = min(self.limit, 100000) + + self.proxy_model.results.rate[self.priority][params.time] = self.rate + 
self.proxy_model.results.limit[self.priority][params.time] = self.limit + + def can_start(self, params): + return params.num_started < self.limit + + def update_budget(self, params): + self.limit -= params.num_started + +class PositiveBudgetLimiter(OriginalLimiter): + def __init__(self, priority, limit_rate_model, proxy_model): + OriginalLimiter.__init__(self, priority, limit_rate_model, proxy_model) + + def update_limit(self, params): + self.limit += params.elapsed * self.rate + self.limit = min(self.limit, 2.0 * self.rate) + +class ClampedBudgetLimiter(PositiveBudgetLimiter): + def __init__(self, priority, limit_rate_model, proxy_model): + PositiveBudgetLimiter.__init__(self, priority, limit_rate_model, proxy_model) + + def update_budget(self, params): + min_budget = -self.rate * 5.0 + if self.limit > min_budget: + self.limit = max(self.limit - params.num_started, min_budget) + +class TimeLimiter(PositiveBudgetLimiter): + def __init__(self, priority, limit_rate_model, proxy_model): + PositiveBudgetLimiter.__init__(self, priority, limit_rate_model, proxy_model) + self.locked_until = 0 + + def can_start(self, params): + return params.time >= self.locked_until and PositiveBudgetLimiter.can_start(self, params) + + def update_budget(self, params): + #print('Start update budget: time=%f, limit=%f, locked_until=%f, num_started=%d, priority=%s, min_priority=%s, last_batch=%d' % (params.time, self.limit, self.locked_until, params.num_started, self.priority, params.min_priority, params.last_batch)) + + if params.min_priority >= self.priority or params.num_started < self.limit: + self.limit -= params.num_started + else: + self.limit = min(self.limit, max(self.limit - params.num_started, -params.last_batch)) + self.locked_until = min(params.time + 2.0, max(params.time, self.locked_until) + (params.num_started - self.limit)/self.rate) + + #print('End update budget: time=%f, limit=%f, locked_until=%f, num_started=%d, priority=%s, min_priority=%s' % (params.time, self.limit, 
#          self.locked_until, params.num_started, self.priority, params.min_priority))

class TimePositiveBudgetLimiter(PositiveBudgetLimiter):
    """PositiveBudgetLimiter variant that, instead of letting the budget go
    negative, locks out new releases for a period proportional to how far a
    batch overshot the limit."""
    def __init__(self, priority, limit_rate_model, proxy_model):
        PositiveBudgetLimiter.__init__(self, priority, limit_rate_model, proxy_model)
        self.locked_until = 0  # no budget accumulates before this time

    def update_limit(self, params):
        # Budget accumulates only while not locked out.
        if params.time >= self.locked_until:
            PositiveBudgetLimiter.update_limit(self, params)

    def can_start(self, params):
        return params.num_started + params.count <= self.limit

    def update_budget(self, params):
        if params.num_started > self.limit:
            # BUG FIX: the original referenced an undefined name `penalty`,
            # which raised NameError whenever a batch overshot the limit.
            # The penalty is the overshoot amount, converted to seconds of
            # lockout by dividing by the release rate (extending the lockout
            # at most 2 seconds past the current time), mirroring
            # TimeLimiter.update_budget.
            penalty = params.num_started - self.limit
            self.locked_until = min(params.time + 2.0, max(params.time, self.locked_until) + penalty / self.rate)
            self.limit = 0
        else:
            self.limit -= params.num_started

class SmoothingLimiter(OriginalLimiter):
    """Limiter that exponentially smooths both the ratekeeper rate and the
    released-transaction rate, releasing up to twice their difference."""
    def __init__(self, priority, limit_rate_model, proxy_model):
        OriginalLimiter.__init__(self, priority, limit_rate_model, proxy_model)
        self.smooth_released = Smoother(2)
        self.smooth_rate_limit = Smoother(2)
        self.rate_set = False  # True once the first rate sample has seeded the smoother

    def update_rate(self, params):
        OriginalLimiter.update_rate(self, params)
        if not self.rate_set:
            # Seed the smoother with the first observed rate rather than
            # smoothing up from zero.
            self.rate_set = True
            self.smooth_rate_limit.reset(self.rate)
        else:
            self.smooth_rate_limit.set_total(params.time, self.rate)

    def update_limit(self, params):
        self.limit = 2.0 * (self.smooth_rate_limit.smooth_total(params.time) - self.smooth_released.smooth_rate(params.time))

    def can_start(self, params):
        # (the comparison expression continues in the next mangled chunk)
        return
params.num_started + params.count <= self.limit + + def update_budget(self, params): + self.smooth_released.add_delta(params.time, params.num_started) + +class SmoothingBudgetLimiter(SmoothingLimiter): + def __init__(self, priority, limit_rate_model, proxy_model): + SmoothingLimiter.__init__(self, priority, limit_rate_model, proxy_model) + #self.smooth_filled = Smoother(2) + self.budget = 0 + + def update_limit(self, params): + release_rate = (self.smooth_rate_limit.smooth_total(params.time) - self.smooth_released.smooth_rate(params.time)) + #self.smooth_filled.set_total(params.time, 1 if release_rate > 0 else 0) + self.limit = 2.0 * release_rate + + self.proxy_model.results.rate[self.priority][params.time] = self.smooth_rate_limit.smooth_total(params.time) + self.proxy_model.results.released[self.priority][params.time] = self.smooth_released.smooth_rate(params.time) + self.proxy_model.results.limit[self.priority][params.time] = self.limit + self.proxy_model.results.limit_and_budget[self.priority][params.time] = self.limit + self.budget + self.proxy_model.results.budget[self.priority][params.time] = self.budget + + #self.budget = max(0, self.budget + params.elapsed * self.smooth_rate_limit.smooth_total(params.time)) + + #if self.smooth_filled.smooth_total(params.time) >= 0.1: + #self.budget += params.elapsed * self.smooth_rate_limit.smooth_total(params.time) + + #print('Update limit: time=%f, priority=%s, limit=%f, rate=%f, released=%f, budget=%f' % (params.time, self.priority, self.limit, self.smooth_rate_limit.smooth_total(params.time), self.smooth_released.smooth_rate(params.time), self.budget)) + + def can_start(self, params): + return params.num_started + params.count <= self.limit + self.budget #or params.num_started + params.count <= self.budget + + def update_budget(self, params): + self.budget = max(0, self.budget + (self.limit - params.num_started_at_priority) / 2 * params.elapsed) + + if params.queue_empty: + self.budget = min(10, self.budget) + + 
self.smooth_released.add_delta(params.time, params.num_started_at_priority) + +class ProxyModel: + class Results: + def __init__(self, priorities, duration): + self.started = self.init_result(priorities, 0, duration) + self.queued = self.init_result(priorities, 0, duration) + self.latencies = self.init_result(priorities, [], duration) + self.unprocessed_queue_sizes = self.init_result(priorities, [], duration) + + self.rate = {p:{} for p in priorities} + self.released = {p:{} for p in priorities} + self.limit = {p:{} for p in priorities} + self.limit_and_budget = {p:{} for p in priorities} + self.budget = {p:{} for p in priorities} + + def init_result(self, priorities, starting_value, duration): + return {p: {s: copy.copy(starting_value) for s in range(0, duration)} for p in priorities} + + def __init__(self, duration, ratekeeper_model, workload_model, Limiter): + self.time = 0 + self.log_time = 0 + self.duration = duration + self.priority_limiters = { priority: Limiter(priority, ratekeeper_model, self) for priority in workload_model.priorities() } + self.workload_model = workload_model + self.request_scheduled = { p: False for p in self.workload_model.priorities()} + + self.tasks = [] + self.request_queue = [] + self.results = ProxyModel.Results(self.workload_model.priorities(), duration) + + def run(self): + self.update_rate() + self.process_requests(self.time) + + for priority in self.workload_model.priorities(): + next_request = self.workload_model.next_request(self.time, priority) + assert next_request is not None + heapq.heappush(self.tasks, Task(next_request.time, lambda next_request=next_request: self.receive_request(next_request))) + self.request_scheduled[priority] = True + + while True:# or len(self.request_queue) > 0: + if int(self.time) > self.log_time: + self.log_time = int(self.time) + #print(self.log_time) + + task = heapq.heappop(self.tasks) + self.time = task.time + if self.time >= self.duration: + break + + task.fxn() + + def update_rate(self): + 
for limiter in self.priority_limiters.values(): + limiter.update_rate(Limiter.UpdateRateParams(self.time)) + + heapq.heappush(self.tasks, Task(self.time + 0.01, lambda: self.update_rate())) + + def receive_request(self, request): + heapq.heappush(self.request_queue, request) + + self.results.queued[request.priority][int(self.time)] += request.count + + next_request = self.workload_model.next_request(self.time, request.priority) + if next_request is not None and next_request.time < self.duration: + heapq.heappush(self.tasks, Task(next_request.time, lambda: self.receive_request(next_request))) + else: + self.request_scheduled[request.priority] = False + + def process_requests(self, last_time): + elapsed = self.time - last_time + for limiter in self.priority_limiters.values(): + limiter.update_limit(Limiter.UpdateLimitParams(self.time, elapsed)) + + current_started = 0 + started = {p:0 for p in self.workload_model.priorities()} + + min_priority = Priority.SYSTEM + last_batch = 0 + while len(self.request_queue) > 0: + request = self.request_queue[0] + + if not self.priority_limiters[request.priority].can_start(Limiter.CanStartParams(self.time, current_started, request.count)): + break + + min_priority = request.priority + last_batch = request.count + + if self.workload_model.request_completed(request) and not self.request_scheduled[request.priority]: + next_request = self.workload_model.next_request(self.time, request.priority) + assert next_request is not None + heapq.heappush(self.tasks, Task(next_request.time, lambda next_request=next_request: self.receive_request(next_request))) + self.request_scheduled[request.priority] = True + + current_started += request.count + started[request.priority] += request.count + + heapq.heappop(self.request_queue) + self.results.started[request.priority][int(self.time)] += request.count + self.results.latencies[request.priority][int(self.time)].append(self.time-request.time) + + if len(self.request_queue) == 0: + min_priority = 
Priority.BATCH + + for priority, limiter in self.priority_limiters.items(): + started_at_priority = sum([v for p,v in started.items() if p <= priority]) + limiter.update_budget(Limiter.UpdateBudgetParams(self.time, current_started, started_at_priority, min_priority, last_batch, len(self.request_queue) == 0 or self.request_queue[0].priority > priority, elapsed)) + + for priority in self.workload_model.priorities(): + self.results.unprocessed_queue_sizes[priority][int(self.time)].append(self.workload_model.workload_models[priority].outstanding) + + current_time = self.time + + delay = 0.001 + heapq.heappush(self.tasks, Task(self.time + delay, lambda: self.process_requests(current_time))) + + diff --git a/contrib/grv_proxy_model/rate_model.py b/contrib/grv_proxy_model/rate_model.py new file mode 100755 index 0000000000..1fabce2c7e --- /dev/null +++ b/contrib/grv_proxy_model/rate_model.py @@ -0,0 +1,83 @@ +# +# rate_model.py +# +# This source file is part of the FoundationDB open source project +# +# Copyright 2013-2020 Apple Inc. and the FoundationDB project authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
#

import numpy

class RateModel:
    """Abstract model producing a transaction rate as a function of time."""
    def __init__(self):
        pass

    def get_rate(self, time):
        """Return the rate at `time`. Subclasses must override."""
        pass

class FixedRateModel(RateModel):
    """Always returns the same rate."""
    def __init__(self, rate):
        RateModel.__init__(self)
        self.rate = rate

    def get_rate(self, time):
        return self.rate

class UnlimitedRateModel(FixedRateModel):
    """Effectively unbounded rate (1e9 per second)."""
    def __init__(self):
        # FIX: delegate to the parent constructor instead of assigning
        # self.rate directly, keeping FixedRateModel initialization in one place.
        FixedRateModel.__init__(self, 1e9)

class IntervalRateModel(RateModel):
    """Piecewise-constant rate given as [(start_time, rate), ...].

    NOTE: get_rate discards intervals that have already passed, so queries
    must be made with non-decreasing times.
    """
    def __init__(self, intervals):
        self.intervals = sorted(intervals)

    def get_rate(self, time):
        # Before the first interval starts (or with no intervals), rate is 0.
        if len(self.intervals) == 0 or time < self.intervals[0][0]:
            return 0

        target_interval = len(self.intervals) - 1
        for i in range(1, len(self.intervals)):
            if time < self.intervals[i][0]:
                target_interval = i - 1
                break

        # Drop elapsed intervals; assumes monotonically increasing queries.
        self.intervals = self.intervals[target_interval:]
        return self.intervals[0][1]

class SawtoothRateModel(RateModel):
    """Alternates between `low` and `high` every half `frequency` period."""
    def __init__(self, low, high, frequency):
        self.low = low
        self.high = high
        self.frequency = frequency

    def get_rate(self, time):
        if int(2 * time / self.frequency) % 2 == 0:
            return self.low
        else:
            return self.high

class DistributionRateModel(RateModel):
    """Redraws the rate from `distribution` every `frequency` seconds
    (on every call when frequency is 0)."""
    def __init__(self, distribution, frequency):
        self.distribution = distribution
        self.frequency = frequency
        self.last_change = 0
        self.rate = None

    def get_rate(self, time):
        if self.frequency == 0 or int((time - self.last_change) / self.frequency) > int(self.last_change / self.frequency) or self.rate is None:
            self.last_change = time
            self.rate = self.distribution()

        return self.rate

# diff --git a/contrib/grv_proxy_model/ratekeeper_model.py b/contrib/grv_proxy_model/ratekeeper_model.py
# new file mode 100755
# index 0000000000..57125dc4c0
# --- /dev/null
# +++ b/contrib/grv_proxy_model/ratekeeper_model.py
# @@ -0,0 +1,67 @@
#
# ratekeeper_model.py  (header originally said "ratekeeper.py"; corrected to match the filename)
#
# This source file is part of the FoundationDB open source project
#
# Copyright 2013-2020 Apple Inc.
and the FoundationDB project authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import numpy +import rate_model +from priority import Priority + +class RatekeeperModel: + def __init__(self, limit_models): + self.limit_models = limit_models + + def get_limit(self, time, priority): + return self.limit_models[priority].get_rate(time) + +predefined_ratekeeper = {} + +predefined_ratekeeper['default200_batch100'] = RatekeeperModel( +{ + Priority.SYSTEM: rate_model.UnlimitedRateModel(), + Priority.DEFAULT: rate_model.FixedRateModel(200), + Priority.BATCH: rate_model.FixedRateModel(100) +}) + +predefined_ratekeeper['default_sawtooth'] = RatekeeperModel( +{ + Priority.SYSTEM: rate_model.UnlimitedRateModel(), + Priority.DEFAULT: rate_model.SawtoothRateModel(10, 200, 1), + Priority.BATCH: rate_model.FixedRateModel(0) +}) + +predefined_ratekeeper['default_uniform_random'] = RatekeeperModel( +{ + Priority.SYSTEM: rate_model.UnlimitedRateModel(), + Priority.DEFAULT: rate_model.DistributionRateModel(lambda: numpy.random.uniform(10, 200), 1), + Priority.BATCH: rate_model.FixedRateModel(0) +}) + +predefined_ratekeeper['default_trickle'] = RatekeeperModel( +{ + Priority.SYSTEM: rate_model.UnlimitedRateModel(), + Priority.DEFAULT: rate_model.FixedRateModel(3), + Priority.BATCH: rate_model.FixedRateModel(0) +}) + +predefined_ratekeeper['default1000'] = RatekeeperModel( +{ + Priority.SYSTEM: rate_model.UnlimitedRateModel(), + Priority.DEFAULT: rate_model.FixedRateModel(1000), + 
#     Priority.BATCH: rate_model.FixedRateModel(500)
# })
# (tail of predefined_ratekeeper['default1000'] from the previous chunk, shown
#  as a comment because the chunk boundary splits the dict literal)

# diff --git a/contrib/grv_proxy_model/smoother.py b/contrib/grv_proxy_model/smoother.py
# new file mode 100644
# index 0000000000..bc1b32ea12
# --- /dev/null
# +++ b/contrib/grv_proxy_model/smoother.py
# @@ -0,0 +1,53 @@
#
# smoother.py
#
# This source file is part of the FoundationDB open source project
#
# Copyright 2013-2020 Apple Inc. and the FoundationDB project authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import math

class Smoother:
    """Exponentially-weighted smoother of a running total.

    Maintains `total` (the exact running value) and `estimate` (an
    exponentially-smoothed version of it with time constant `folding_time`).
    `smooth_rate` reports the smoothed rate at which the total is changing.
    """

    def __init__(self, folding_time):
        self.folding_time = folding_time
        self.reset(0)

    def reset(self, value):
        """Restart smoothing at time 0 with both total and estimate at `value`."""
        self.time = 0
        self.total = value
        self.estimate = value

    def set_total(self, time, total):
        """Set the running total to an absolute value at `time`."""
        self.add_delta(time, total - self.total)

    def add_delta(self, time, delta):
        """Advance the estimate to `time`, then add `delta` to the total."""
        self.update(time)
        self.total += delta

    def smooth_total(self, time):
        """Return the smoothed estimate of the total at `time`."""
        self.update(time)
        return self.estimate

    def smooth_rate(self, time):
        """Return the smoothed rate of change of the total at `time`."""
        self.update(time)
        return (self.total - self.estimate) / self.folding_time

    def update(self, time):
        # Fold the estimate toward the total by the fraction corresponding to
        # the elapsed time; non-positive elapsed time is a no-op.
        dt = time - self.time
        if dt <= 0:
            return
        self.time = time
        self.estimate += (self.total - self.estimate) * (1 - math.exp(-dt / self.folding_time))

# diff --git a/contrib/grv_proxy_model/workload_model.py b/contrib/grv_proxy_model/workload_model.py
# new file mode 100755
# index 0000000000..63fb4c472e
# --- /dev/null
# +++ b/contrib/grv_proxy_model/workload_model.py
# @@ -0,0 +1,201 @@
#
# workload_model.py
#
# This source file is
part of the FoundationDB open source project +# +# Copyright 2013-2020 Apple Inc. and the FoundationDB project authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import functools +import numpy +import math + +import rate_model +from priority import Priority + +@functools.total_ordering +class Request: + def __init__(self, time, count, priority): + self.time = time + self.count = count + self.priority = priority + + def __lt__(self, other): + return self.priority < other.priority + +class PriorityWorkloadModel: + def __init__(self, priority, rate_model, batch_model, generator, max_outstanding=1e9): + self.priority = priority + self.rate_model = rate_model + self.batch_model = batch_model + self.generator = generator + self.max_outstanding = max_outstanding + self.outstanding = 0 + + def next_request(self, time): + if self.outstanding >= self.max_outstanding: + return None + + batch_size = self.batch_model.next_batch() + self.outstanding += batch_size + interval = self.generator.next_request_interval(self.rate_model.get_rate(time)) + return Request(time + interval, batch_size, self.priority) + + def request_completed(self, request): + was_full = self.max_outstanding <= self.outstanding + self.outstanding -= request.count + + return was_full and self.outstanding < self.max_outstanding + +class WorkloadModel: + def __init__(self, workload_models): + self.workload_models = workload_models + + def priorities(self): + return list(self.workload_models.keys()) + + 
def next_request(self, time, priority): + return self.workload_models[priority].next_request(time) + + def request_completed(self, request): + return self.workload_models[request.priority].request_completed(request) + +class Distribution: + EXPONENTIAL = lambda x: numpy.random.exponential(x) + UNIFORM = lambda x: numpy.random.uniform(0, 2.0*x) + FIXED = lambda x: x + +class BatchGenerator: + def __init__(self): + pass + + def next_batch(self): + pass + +class DistributionBatchGenerator(BatchGenerator): + def __init__(self, distribution, size): + BatchGenerator.__init__(self) + self.distribution = distribution + self.size = size + + def next_batch(self): + return math.ceil(self.distribution(self.size)) + +class RequestGenerator: + def __init__(self): + pass + + def next_request_interval(self, rate): + pass + +class DistributionRequestGenerator(RequestGenerator): + def __init__(self, distribution): + RequestGenerator.__init__(self) + self.distribution = distribution + + def next_request_interval(self, rate): + if rate == 0: + return 1e9 + + return self.distribution(1.0/rate) + +predefined_workloads = {} + +predefined_workloads['slow_exponential'] = WorkloadModel( +{ + Priority.DEFAULT: PriorityWorkloadModel(Priority.DEFAULT, + rate_model.FixedRateModel(100), + DistributionBatchGenerator(Distribution.FIXED, 1), + DistributionRequestGenerator(Distribution.EXPONENTIAL), + max_outstanding=100 + ) +}) + +predefined_workloads['fixed_uniform'] = WorkloadModel( +{ + Priority.SYSTEM: PriorityWorkloadModel(Priority.SYSTEM, + rate_model.FixedRateModel(0), + DistributionBatchGenerator(Distribution.FIXED, 1), + DistributionRequestGenerator(Distribution.UNIFORM), + max_outstanding=10 + ), + Priority.DEFAULT: PriorityWorkloadModel(Priority.DEFAULT, + rate_model.FixedRateModel(95), + DistributionBatchGenerator(Distribution.FIXED, 10), + DistributionRequestGenerator(Distribution.UNIFORM), + max_outstanding=200 + ), + Priority.BATCH: PriorityWorkloadModel(Priority.BATCH, + 
rate_model.FixedRateModel(1), + DistributionBatchGenerator(Distribution.UNIFORM, 500), + DistributionRequestGenerator(Distribution.UNIFORM), + max_outstanding=200 + ) +}) + +predefined_workloads['batch_starvation'] = WorkloadModel( +{ + Priority.SYSTEM: PriorityWorkloadModel(Priority.SYSTEM, + rate_model.FixedRateModel(1), + DistributionBatchGenerator(Distribution.FIXED, 1), + DistributionRequestGenerator(Distribution.UNIFORM), + max_outstanding=10 + ), + Priority.DEFAULT: PriorityWorkloadModel(Priority.DEFAULT, + rate_model.IntervalRateModel([(0,50), (60,150), (120,90)]), + DistributionBatchGenerator(Distribution.FIXED, 1), + DistributionRequestGenerator(Distribution.UNIFORM), + max_outstanding=200 + ), + Priority.BATCH: PriorityWorkloadModel(Priority.BATCH, + rate_model.FixedRateModel(100), + DistributionBatchGenerator(Distribution.FIXED, 1), + DistributionRequestGenerator(Distribution.UNIFORM), + max_outstanding=200 + ) +}) + +predefined_workloads['default_low_high_low'] = WorkloadModel( +{ + Priority.SYSTEM: PriorityWorkloadModel(Priority.SYSTEM, + rate_model.FixedRateModel(0), + DistributionBatchGenerator(Distribution.FIXED, 1), + DistributionRequestGenerator(Distribution.UNIFORM), + max_outstanding=10 + ), + Priority.DEFAULT: PriorityWorkloadModel(Priority.DEFAULT, + rate_model.IntervalRateModel([(0,100), (60,300), (120,100)]), + DistributionBatchGenerator(Distribution.FIXED, 1), + DistributionRequestGenerator(Distribution.UNIFORM), + max_outstanding=200 + ), + Priority.BATCH: PriorityWorkloadModel(Priority.BATCH, + rate_model.FixedRateModel(0), + DistributionBatchGenerator(Distribution.FIXED, 1), + DistributionRequestGenerator(Distribution.UNIFORM), + max_outstanding=200 + ) +}) + +for rate in [83, 100, 180, 190, 200]: + predefined_workloads['default%d' % rate] = WorkloadModel( + { + Priority.DEFAULT: PriorityWorkloadModel(Priority.DEFAULT, + rate_model.FixedRateModel(rate), + DistributionBatchGenerator(Distribution.FIXED, 1), + 
DistributionRequestGenerator(Distribution.EXPONENTIAL), + max_outstanding=1000 + ) + }) From 82f7f541c39377ae2386cc52b777b354b3f545c4 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Wed, 25 Nov 2020 11:38:08 -0700 Subject: [PATCH 002/461] started lineage implementation --- flow/flow.cpp | 2 ++ flow/flow.h | 33 +++++++++++++++++++++++++++++++-- 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/flow/flow.cpp b/flow/flow.cpp index 89f04bd5df..a2bfcc1510 100644 --- a/flow/flow.cpp +++ b/flow/flow.cpp @@ -26,6 +26,8 @@ #include #include +thread_local ActorLineagePropertyMap* currentLineage = nullptr; + #if (defined(__linux__) || defined(__FreeBSD__)) && defined(__AVX__) && !defined(MEMORY_SANITIZER) // For benchmarking; need a version of rte_memcpy that doesn't live in the same compilation unit as the test. void * rte_memcpy_noinline(void *__restrict __dest, const void *__restrict __src, size_t __n) { diff --git a/flow/flow.h b/flow/flow.h index a72465143d..155c5db2a2 100644 --- a/flow/flow.h +++ b/flow/flow.h @@ -36,6 +36,7 @@ #include #include #include +#include #include "flow/Platform.h" #include "flow/FastAlloc.h" @@ -407,6 +408,30 @@ struct SingleCallback { } }; +// in the future we might want to read these from a different thread. std::shared_ptr +// seems to be better suited for this... +struct ActorLineagePropertyMap : std::enable_shared_from_this { + std::shared_ptr parent = nullptr; +}; + +extern thread_local ActorLineagePropertyMap* currentLineage; + +struct ActorLineage { + std::shared_ptr properties = std::make_shared(); + ActorLineage() { + if (currentLineage) { + properties->parent = currentLineage->shared_from_this(); + } + } +}; + +struct save_lineage { + ActorLineagePropertyMap* current = currentLineage; + ~save_lineage() { + currentLineage = current; + } +}; + // SAV is short for Single Assignment Variable: It can be assigned for only once! 
template struct SAV : private Callback, FastAllocated> { @@ -445,6 +470,7 @@ public: ASSERT(canBeSet()); new (&value_storage) T(std::forward(value)); this->error_state = Error::fromCode(SET_ERROR_CODE); + save_lineage _{}; while (Callback::next != this) Callback::next->fire(this->value()); } @@ -457,6 +483,7 @@ public: void sendError(Error err) { ASSERT(canBeSet() && int16_t(err.code()) > 0); this->error_state = err; + save_lineage _{}; while (Callback::next != this) Callback::next->error(err); } @@ -477,6 +504,7 @@ public: void finishSendAndDelPromiseRef() { // Call only after value_storage has already been initialized! this->error_state = Error::fromCode(SET_ERROR_CODE); + save_lineage _{}; while (Callback::next != this) Callback::next->fire(this->value()); @@ -500,6 +528,7 @@ public: } this->error_state = err; + save_lineage _{}; while (Callback::next != this) Callback::next->error(err); @@ -987,7 +1016,7 @@ static inline void destruct(T& t) { } template -struct Actor : SAV { +struct Actor : SAV, ActorLineage { int8_t actor_wait_state; // -1 means actor is cancelled; 0 means actor is not waiting; 1-N mean waiting in callback group # Actor() : SAV(1, 1), actor_wait_state(0) { /*++actorCount;*/ } @@ -995,7 +1024,7 @@ struct Actor : SAV { }; template <> -struct Actor { +struct Actor : ActorLineage { // This specialization is for a void actor (one not returning a future, hence also uncancellable) int8_t actor_wait_state; // 0 means actor is not waiting; 1-N mean waiting in callback group # From 05f77f905fb3a32c026729479de3de5456a5789e Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Mon, 7 Dec 2020 15:15:25 -0700 Subject: [PATCH 003/461] Added actor lineage --- flow/actorcompiler/ActorCompiler.cs | 1 + flow/actorcompiler/actorcompiler.csproj | 108 +----------------------- flow/actorcompiler/actorcompiler.sln | 34 ++++++++ flow/flow.cpp | 5 +- flow/flow.h | 96 +++++++++++++-------- flow/genericactors.actor.h | 4 + 6 files changed, 110 insertions(+), 138 
deletions(-) create mode 100644 flow/actorcompiler/actorcompiler.sln diff --git a/flow/actorcompiler/ActorCompiler.cs b/flow/actorcompiler/ActorCompiler.cs index 7aef82a42e..dc9de91868 100644 --- a/flow/actorcompiler/ActorCompiler.cs +++ b/flow/actorcompiler/ActorCompiler.cs @@ -452,6 +452,7 @@ namespace actorcompiler fullClassName, string.Join(", ", actor.parameters.Select(p => p.name).ToArray())); + writer.WriteLine("restore_lineage _;"); if (actor.returnType != null) writer.WriteLine("\treturn Future<{1}>({0});", newActor, actor.returnType); else diff --git a/flow/actorcompiler/actorcompiler.csproj b/flow/actorcompiler/actorcompiler.csproj index e737adabd2..b590913634 100644 --- a/flow/actorcompiler/actorcompiler.csproj +++ b/flow/actorcompiler/actorcompiler.csproj @@ -1,108 +1,8 @@ - - + + - Debug - 10.0.20506 - 2.0 - {0ECC1314-3FC2-458D-8E41-B50B4EA24E51} Exe - Properties - actorcompiler - actorcompiler - v4.0 - 512 - $(SolutionDir)bin\$(Configuration)\ - publish\ - true - Disk - false - Foreground - 7 - Days - false - false - true - 0 - 1.0.0.%2a - false - false - true + net5.0 - - true - DEBUG;TRACE - full - AnyCPU - default - prompt - false - false - - - TRACE - true - pdbonly - AnyCPU - default - prompt - false - false - - - - - 3.5 - - - 3.5 - - - 3.5 - - - 4.0 - - - - - - - - - - - - - - False - Microsoft .NET Framework 4 %28x86 and x64%29 - true - - - False - .NET Framework 3.5 SP1 Client Profile - false - - - False - .NET Framework 3.5 SP1 - false - - - False - Windows Installer 3.1 - true - - - - - - - + \ No newline at end of file diff --git a/flow/actorcompiler/actorcompiler.sln b/flow/actorcompiler/actorcompiler.sln new file mode 100644 index 0000000000..a4292bfaaa --- /dev/null +++ b/flow/actorcompiler/actorcompiler.sln @@ -0,0 +1,34 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 15 +VisualStudioVersion = 15.0.26124.0 +MinimumVisualStudioVersion = 15.0.26124.0 +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = 
"actorcompiler", "actorcompiler.csproj", "{0ECC1314-3FC2-458D-8E41-B50B4EA24E51}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Any CPU = Debug|Any CPU + Debug|x64 = Debug|x64 + Debug|x86 = Debug|x86 + Release|Any CPU = Release|Any CPU + Release|x64 = Release|x64 + Release|x86 = Release|x86 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Debug|Any CPU.Build.0 = Debug|Any CPU + {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Debug|x64.ActiveCfg = Debug|Any CPU + {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Debug|x64.Build.0 = Debug|Any CPU + {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Debug|x86.ActiveCfg = Debug|Any CPU + {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Debug|x86.Build.0 = Debug|Any CPU + {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Release|Any CPU.ActiveCfg = Debug|Any CPU + {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Release|Any CPU.Build.0 = Debug|Any CPU + {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Release|x64.ActiveCfg = Debug|Any CPU + {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Release|x64.Build.0 = Debug|Any CPU + {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Release|x86.ActiveCfg = Debug|Any CPU + {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Release|x86.Build.0 = Debug|Any CPU + EndGlobalSection +EndGlobal diff --git a/flow/flow.cpp b/flow/flow.cpp index a2bfcc1510..c4a6097300 100644 --- a/flow/flow.cpp +++ b/flow/flow.cpp @@ -26,7 +26,10 @@ #include #include -thread_local ActorLineagePropertyMap* currentLineage = nullptr; +extern thread_local Reference currentLineage; + +ActorLineage::ActorLineage() : parent(currentLineage) { +} #if (defined(__linux__) || defined(__FreeBSD__)) && defined(__AVX__) && !defined(MEMORY_SANITIZER) // For benchmarking; need a version of rte_memcpy that doesn't live in the same 
compilation unit as the test. diff --git a/flow/flow.h b/flow/flow.h index 155c5db2a2..a0c9793a7a 100644 --- a/flow/flow.h +++ b/flow/flow.h @@ -20,6 +20,7 @@ #ifndef FLOW_FLOW_H #define FLOW_FLOW_H +#include "flow/FastRef.h" #pragma once #pragma warning( disable: 4244 4267 ) // SOMEDAY: Carefully check for integer overflow issues (e.g. size_t to int conversions like this suppresses) @@ -408,28 +409,21 @@ struct SingleCallback { } }; -// in the future we might want to read these from a different thread. std::shared_ptr -// seems to be better suited for this... -struct ActorLineagePropertyMap : std::enable_shared_from_this { - std::shared_ptr parent = nullptr; +struct ActorLineagePropertyMap : ReferenceCounted { }; -extern thread_local ActorLineagePropertyMap* currentLineage; - -struct ActorLineage { - std::shared_ptr properties = std::make_shared(); - ActorLineage() { - if (currentLineage) { - properties->parent = currentLineage->shared_from_this(); - } - } +struct ActorLineage : ReferenceCounted { + Reference map; + Reference parent; + ActorLineage(); }; -struct save_lineage { - ActorLineagePropertyMap* current = currentLineage; - ~save_lineage() { - currentLineage = current; - } +extern thread_local Reference currentLineage; + +struct restore_lineage { + Reference lineage; + restore_lineage() : lineage(currentLineage) {} + ~restore_lineage() { currentLineage = lineage; } }; // SAV is short for Single Assignment Variable: It can be assigned for only once! 
@@ -447,7 +441,8 @@ public: T& value() { return *(T*)&value_storage; } - SAV(int futures, int promises) : futures(futures), promises(promises), error_state(Error::fromCode(UNSET_ERROR_CODE)) { + SAV(int futures, int promises) + : futures(futures), promises(promises), error_state(Error::fromCode(UNSET_ERROR_CODE)) { Callback::prev = Callback::next = this; } ~SAV() { @@ -466,13 +461,14 @@ public: } template - void send(U && value) { + void send(U&& value) { ASSERT(canBeSet()); new (&value_storage) T(std::forward(value)); this->error_state = Error::fromCode(SET_ERROR_CODE); - save_lineage _{}; - while (Callback::next != this) + restore_lineage _; + while (Callback::next != this) { Callback::next->fire(this->value()); + } } void send(Never) { @@ -483,13 +479,15 @@ public: void sendError(Error err) { ASSERT(canBeSet() && int16_t(err.code()) > 0); this->error_state = err; - save_lineage _{}; - while (Callback::next != this) + restore_lineage _; + while (Callback::next != this) { Callback::next->error(err); + } } template void sendAndDelPromiseRef(U && value) { + restore_lineage _; ASSERT(canBeSet()); if (promises == 1 && !futures) { // No one is left to receive the value, so we can just die @@ -503,8 +501,8 @@ public: void finishSendAndDelPromiseRef() { // Call only after value_storage has already been initialized! 
+ restore_lineage _; this->error_state = Error::fromCode(SET_ERROR_CODE); - save_lineage _{}; while (Callback::next != this) Callback::next->fire(this->value()); @@ -520,6 +518,7 @@ public: } void sendErrorAndDelPromiseRef(Error err) { + restore_lineage _; ASSERT(canBeSet() && int16_t(err.code()) > 0); if (promises == 1 && !futures) { // No one is left to receive the value, so we can just die @@ -528,7 +527,6 @@ public: } this->error_state = err; - save_lineage _{}; while (Callback::next != this) Callback::next->error(err); @@ -624,6 +622,7 @@ struct NotifiedQueue : private SingleCallback, FastAllocated if (error.isValid()) return; if (SingleCallback::next != this) { + restore_lineage _; SingleCallback::next->fire(std::forward(value)); } else { @@ -635,8 +634,10 @@ struct NotifiedQueue : private SingleCallback, FastAllocated if (error.isValid()) return; this->error = err; - if (SingleCallback::next != this) + if (SingleCallback::next != this) { + restore_lineage _; SingleCallback::next->error(err); + } } void addPromiseRef() { promises++; } @@ -1016,38 +1017,67 @@ static inline void destruct(T& t) { } template -struct Actor : SAV, ActorLineage { +struct Actor : SAV { + Reference lineage = Reference{new ActorLineage() }; int8_t actor_wait_state; // -1 means actor is cancelled; 0 means actor is not waiting; 1-N mean waiting in callback group # - Actor() : SAV(1, 1), actor_wait_state(0) { /*++actorCount;*/ } + Actor() : SAV(1, 1), actor_wait_state(0) { + /*++actorCount;*/ + currentLineage = lineage; + } + + Reference setLineage() { + auto res = currentLineage; + currentLineage = lineage; + return res; + } //~Actor() { --actorCount; } }; template <> -struct Actor : ActorLineage { +struct Actor { // This specialization is for a void actor (one not returning a future, hence also uncancellable) + Reference lineage = Reference{new ActorLineage() }; int8_t actor_wait_state; // 0 means actor is not waiting; 1-N mean waiting in callback group # - Actor() : actor_wait_state(0) 
{ /*++actorCount;*/ } + Actor() : actor_wait_state(0) { + /*++actorCount;*/ + currentLineage = lineage; + } + + Reference setLineage() { + auto res = currentLineage; + currentLineage = lineage; + return res; + } //~Actor() { --actorCount; } }; template struct ActorCallback : Callback { - virtual void fire(ValueType const& value) override { static_cast(this)->a_callback_fire(this, value); } - virtual void error(Error e) override { static_cast(this)->a_callback_error(this, e); } + virtual void fire(ValueType const& value) override { + auto _ = static_cast(this)->setLineage(); + static_cast(this)->a_callback_fire(this, value); + } + virtual void error(Error e) override { + auto _ = static_cast(this)->setLineage(); + static_cast(this)->a_callback_error(this, e); + } }; template struct ActorSingleCallback : SingleCallback { virtual void fire(ValueType const& value) override { + auto _ = static_cast(this)->setLineage(); static_cast(this)->a_callback_fire(this, value); } virtual void fire(ValueType && value) override { + auto _ = static_cast(this)->setLineage(); static_cast(this)->a_callback_fire(this, std::move(value)); } virtual void error(Error e) override { + auto _ = static_cast(this)->setLineage(); static_cast(this)->a_callback_error(this, e); } }; diff --git a/flow/genericactors.actor.h b/flow/genericactors.actor.h index 3fcab1f7dd..ab9d9c07d5 100644 --- a/flow/genericactors.actor.h +++ b/flow/genericactors.actor.h @@ -1493,6 +1493,10 @@ struct YieldedFutureActor : SAV, ActorCallback setLineage() { + return currentLineage; + } + void a_callback_fire(ActorCallback*, Void) { if (int16_t(in_error_state.code()) == UNSET_ERROR_CODE) { in_error_state = Error::fromCode(SET_ERROR_CODE); From d837e923ad9f8cbf3a5bcd5668a74d4ee0222c32 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Mon, 7 Dec 2020 15:23:18 -0700 Subject: [PATCH 004/461] minor bugfix --- flow/flow.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flow/flow.cpp b/flow/flow.cpp index 
c4a6097300..ed977141bd 100644 --- a/flow/flow.cpp +++ b/flow/flow.cpp @@ -26,7 +26,7 @@ #include #include -extern thread_local Reference currentLineage; +thread_local Reference currentLineage; ActorLineage::ActorLineage() : parent(currentLineage) { } From 2c4e38329e536172d2413da61d884ef944277598 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Wed, 9 Dec 2020 10:19:32 -0700 Subject: [PATCH 005/461] fix some compiler warnings --- fdbclient/SystemData.cpp | 6 +++--- fdbserver/BackupProgress.actor.cpp | 2 +- fdbserver/BackupWorker.actor.cpp | 6 +++--- fdbserver/CommitProxyServer.actor.cpp | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/fdbclient/SystemData.cpp b/fdbclient/SystemData.cpp index b402ad99a7..16733b1ad6 100644 --- a/fdbclient/SystemData.cpp +++ b/fdbclient/SystemData.cpp @@ -57,7 +57,7 @@ const Value keyServersValue( Standalone result, const std::vecto std::vector destTag; bool foundOldLocality = false; - for (const KeyValueRef kv : result) { + for (const KeyValueRef& kv : result) { UID uid = decodeServerTagKey(kv.key); if (std::find(src.begin(), src.end(), uid) != src.end()) { srcTag.push_back( decodeServerTagValue(kv.value) ); @@ -109,7 +109,7 @@ void decodeKeyServersValue( Standalone result, const ValueRef& v src.clear(); dest.clear(); - for (const KeyValueRef kv : result) { + for (const KeyValueRef& kv : result) { Tag tag = decodeServerTagValue(kv.value); if (std::find(srcTag.begin(), srcTag.end(), tag) != srcTag.end()) { src.push_back( decodeServerTagKey(kv.key) ); @@ -122,7 +122,7 @@ void decodeKeyServersValue( Standalone result, const ValueRef& v std::sort(dest.begin(), dest.end()); if(missingIsError && (src.size() != srcTag.size() || dest.size() != destTag.size())) { TraceEvent(SevError, "AttemptedToDecodeMissingTag"); - for (const KeyValueRef kv : result) { + for (const KeyValueRef& kv : result) { Tag tag = decodeServerTagValue(kv.value); UID serverID = decodeServerTagKey(kv.key); TraceEvent("TagUIDMap").detail("Tag", 
tag.toString()).detail("UID", serverID.toString()); diff --git a/fdbserver/BackupProgress.actor.cpp b/fdbserver/BackupProgress.actor.cpp index 3f1d564c16..f496ec0558 100644 --- a/fdbserver/BackupProgress.actor.cpp +++ b/fdbserver/BackupProgress.actor.cpp @@ -121,7 +121,7 @@ std::map, std::map> BackupProgr } } - for (const Tag tag : tags) { // tags without progress data + for (const Tag& tag : tags) { // tags without progress data tagVersions.insert({ tag, adjustedBeginVersion }); TraceEvent("BackupVersionRange", dbgid) .detail("OldEpoch", epoch) diff --git a/fdbserver/BackupWorker.actor.cpp b/fdbserver/BackupWorker.actor.cpp index 3cea9f6611..b5f78593e2 100644 --- a/fdbserver/BackupWorker.actor.cpp +++ b/fdbserver/BackupWorker.actor.cpp @@ -508,7 +508,7 @@ ACTOR Future setBackupKeys(BackupData* self, std::map savedL state std::vector>> prevVersions; state std::vector versionConfigs; state std::vector>> allWorkersReady; - for (const auto [uid, version] : savedLogVersions) { + for (const auto& [uid, version] : savedLogVersions) { versionConfigs.emplace_back(uid); prevVersions.push_back(versionConfigs.back().latestBackupWorkerSavedVersion().get(tr)); allWorkersReady.push_back(versionConfigs.back().allWorkerStarted().get(tr)); @@ -573,7 +573,7 @@ ACTOR Future monitorBackupProgress(BackupData* self) { if (self->recruitedEpoch == self->oldestBackupEpoch) { // update update progress so far if previous epochs are done Version v = std::numeric_limits::max(); - for (const auto [tag, version] : tagVersions) { + for (const auto& [tag, version] : tagVersions) { v = std::min(v, version); } savedLogVersions.emplace(uid, v); @@ -783,7 +783,7 @@ ACTOR Future saveMutationsToFile(BackupData* self, Version popVersion, int .detail("TagId", self->tag.id) .detail("File", file->getFileName()); } - for (const UID uid : activeUids) { + for (const UID& uid : activeUids) { self->backups[uid].lastSavedVersion = popVersion + 1; } diff --git a/fdbserver/CommitProxyServer.actor.cpp 
b/fdbserver/CommitProxyServer.actor.cpp index eac0f0d4c2..96ae4c000c 100644 --- a/fdbserver/CommitProxyServer.actor.cpp +++ b/fdbserver/CommitProxyServer.actor.cpp @@ -1778,7 +1778,7 @@ ACTOR Future commitProxyServerCore(CommitProxyInterface proxy, MasterInter state KeyRange txnKeys = allKeys; Standalone UIDtoTagMap = commitData.txnStateStore->readRange( serverTagKeys ).get(); state std::map tag_uid; - for (const KeyValueRef kv : UIDtoTagMap) { + for (const KeyValueRef& kv : UIDtoTagMap) { tag_uid[decodeServerTagValue(kv.value)] = decodeServerTagKey(kv.key); } loop { From 0d324cee80b306797e6f92392414b786ad5ce914 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Wed, 9 Dec 2020 10:19:59 -0700 Subject: [PATCH 006/461] Annotation framework and role lineage --- fdbrpc/CMakeLists.txt | 2 + fdbrpc/Locality.h | 1 + fdbrpc/RoleLineage.cpp | 23 ++++++++++ fdbrpc/RoleLineage.h | 31 +++++++++++++ fdbserver/worker.actor.cpp | 3 ++ flow/flow.cpp | 6 +++ flow/flow.h | 90 ++++++++++++++++++++++++++++++++------ 7 files changed, 142 insertions(+), 14 deletions(-) create mode 100644 fdbrpc/RoleLineage.cpp create mode 100644 fdbrpc/RoleLineage.h diff --git a/fdbrpc/CMakeLists.txt b/fdbrpc/CMakeLists.txt index b4fb20098d..41229dce47 100644 --- a/fdbrpc/CMakeLists.txt +++ b/fdbrpc/CMakeLists.txt @@ -22,6 +22,8 @@ set(FDBRPC_SRCS ReplicationPolicy.cpp ReplicationTypes.cpp ReplicationUtils.cpp + RoleLineage.h + RoleLineage.cpp Stats.actor.cpp Stats.h sim2.actor.cpp diff --git a/fdbrpc/Locality.h b/fdbrpc/Locality.h index 11c209071a..2129b7a3b7 100644 --- a/fdbrpc/Locality.h +++ b/fdbrpc/Locality.h @@ -63,6 +63,7 @@ struct ProcessClass { Ratekeeper, StorageCache, Backup, + Worker, // used for actor lineage tracking NoRole }; enum ClassSource { CommandLineSource, AutoSource, DBSource, InvalidSource = -1 }; diff --git a/fdbrpc/RoleLineage.cpp b/fdbrpc/RoleLineage.cpp new file mode 100644 index 0000000000..89a64bbe40 --- /dev/null +++ b/fdbrpc/RoleLineage.cpp @@ -0,0 +1,23 @@ +/* + * 
RoleLineage.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2020 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "fdbrpc/RoleLineage.h" + +StringRef RoleLineage::name = "RoleLineage"_sr; diff --git a/fdbrpc/RoleLineage.h b/fdbrpc/RoleLineage.h new file mode 100644 index 0000000000..30a2ea2650 --- /dev/null +++ b/fdbrpc/RoleLineage.h @@ -0,0 +1,31 @@ +/* + * RoleLineage.h + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2020 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once +#include "fdbrpc/Locality.h" + +struct RoleLineage : LineageProperties { + static StringRef name; + ProcessClass::ClusterRole role = ProcessClass::NoRole; + + bool isSet(ProcessClass::ClusterRole RoleLineage::*member) { + return this->*member != ProcessClass::NoRole; + } +}; diff --git a/fdbserver/worker.actor.cpp b/fdbserver/worker.actor.cpp index ca34f903a2..98363ea247 100644 --- a/fdbserver/worker.actor.cpp +++ b/fdbserver/worker.actor.cpp @@ -22,6 +22,7 @@ #include #include "fdbrpc/Locality.h" +#include "fdbrpc/RoleLineage.h" #include "fdbclient/StorageServerInterface.h" #include "fdbserver/Knobs.h" #include "flow/ActorCollection.h" @@ -46,6 +47,7 @@ #include "flow/Profiler.h" #include "flow/ThreadHelper.actor.h" #include "flow/Trace.h" +#include "flow/flow.h" #ifdef __linux__ #include @@ -1810,6 +1812,7 @@ ACTOR Future fdbd( { state vector> actors; state Promise recoveredDiskFiles; + currentLineage->modify(&RoleLineage::role) = ProcessClass::Worker; try { ServerCoordinators coordinators( connFile ); diff --git a/flow/flow.cpp b/flow/flow.cpp index ed977141bd..5b354fe054 100644 --- a/flow/flow.cpp +++ b/flow/flow.cpp @@ -31,6 +31,12 @@ thread_local Reference currentLineage; ActorLineage::ActorLineage() : parent(currentLineage) { } +ActorLineage::~ActorLineage() { + for (auto ptr : properties) { + delete ptr.second; + } +} + #if (defined(__linux__) || defined(__FreeBSD__)) && defined(__AVX__) && !defined(MEMORY_SANITIZER) // For benchmarking; need a version of rte_memcpy that doesn't live in the same compilation unit as the test. 
void * rte_memcpy_noinline(void *__restrict __dest, const void *__restrict __src, size_t __n) { diff --git a/flow/flow.h b/flow/flow.h index a0c9793a7a..0ffc895a86 100644 --- a/flow/flow.h +++ b/flow/flow.h @@ -20,6 +20,7 @@ #ifndef FLOW_FLOW_H #define FLOW_FLOW_H +#include "flow/Arena.h" #include "flow/FastRef.h" #pragma once @@ -29,6 +30,7 @@ #include #include +#include #include #include #include @@ -409,21 +411,88 @@ struct SingleCallback { } }; -struct ActorLineagePropertyMap : ReferenceCounted { +struct LineagePropertiesBase { +}; + +// helper class to make implementation of LineageProperties easier +template +struct LineageProperties : LineagePropertiesBase { + // Contract: + // + // StringRef name = "SomeUniqueName"_str; + + + // this has to be implemented by subclasses + // but can't be made virtual. + // A user should implement this for any type + // within the properies class. + template + bool isSet(Value Derived::*member) { + return true; + } }; struct ActorLineage : ReferenceCounted { - Reference map; +private: + std::unordered_map properties; Reference parent; +public: ActorLineage(); + ~ActorLineage(); + bool isRoot() const { + return parent.getPtr() == nullptr; + } + void makeRoot() { + parent.clear(); + } + template + V& modify(V T::*member) { + auto& res = properties[T::name]; + if (!res) { + res = new T{}; + } + T* map = static_cast(res); + return map->*member; + } + template + std::optional get(V T::*member) const { + auto current = this; + while (current != nullptr) { + auto iter = current->properties.find(T::name); + if (iter != current->properties.end()) { + T const& map = static_cast(*iter->second); + if (map.isSet(member)) { + return map.*member; + } + } + current = current->parent.getPtr(); + } + return std::optional{}; + } + template + std::stack stack(V T::*member) const { + auto current = this; + std::stack res; + while (current != nullptr) { + auto iter = current->properties.find(T::name); + if (iter != current->properties.end()) { + T 
const& map = static_cast(*iter->second); + if (map.isSet(member)) { + res.push(map.*member); + } + } + current = current->parent.getPtr(); + } + return res; + } }; extern thread_local Reference currentLineage; struct restore_lineage { - Reference lineage; - restore_lineage() : lineage(currentLineage) {} - ~restore_lineage() { currentLineage = lineage; } + Reference prev; + restore_lineage() : prev(currentLineage) {} + ~restore_lineage() { currentLineage = prev; } }; // SAV is short for Single Assignment Variable: It can be assigned for only once! @@ -465,7 +534,6 @@ public: ASSERT(canBeSet()); new (&value_storage) T(std::forward(value)); this->error_state = Error::fromCode(SET_ERROR_CODE); - restore_lineage _; while (Callback::next != this) { Callback::next->fire(this->value()); } @@ -479,7 +547,6 @@ public: void sendError(Error err) { ASSERT(canBeSet() && int16_t(err.code()) > 0); this->error_state = err; - restore_lineage _; while (Callback::next != this) { Callback::next->error(err); } @@ -487,7 +554,6 @@ public: template void sendAndDelPromiseRef(U && value) { - restore_lineage _; ASSERT(canBeSet()); if (promises == 1 && !futures) { // No one is left to receive the value, so we can just die @@ -501,7 +567,6 @@ public: void finishSendAndDelPromiseRef() { // Call only after value_storage has already been initialized! 
- restore_lineage _; this->error_state = Error::fromCode(SET_ERROR_CODE); while (Callback::next != this) Callback::next->fire(this->value()); @@ -518,7 +583,6 @@ public: } void sendErrorAndDelPromiseRef(Error err) { - restore_lineage _; ASSERT(canBeSet() && int16_t(err.code()) > 0); if (promises == 1 && !futures) { // No one is left to receive the value, so we can just die @@ -622,7 +686,6 @@ struct NotifiedQueue : private SingleCallback, FastAllocated if (error.isValid()) return; if (SingleCallback::next != this) { - restore_lineage _; SingleCallback::next->fire(std::forward(value)); } else { @@ -635,7 +698,6 @@ struct NotifiedQueue : private SingleCallback, FastAllocated this->error = err; if (SingleCallback::next != this) { - restore_lineage _; SingleCallback::next->error(err); } } @@ -1025,13 +1087,13 @@ struct Actor : SAV { /*++actorCount;*/ currentLineage = lineage; } + //~Actor() { --actorCount; } Reference setLineage() { auto res = currentLineage; currentLineage = lineage; return res; } - //~Actor() { --actorCount; } }; template <> @@ -1045,13 +1107,13 @@ struct Actor { /*++actorCount;*/ currentLineage = lineage; } + //~Actor() { --actorCount; } Reference setLineage() { auto res = currentLineage; currentLineage = lineage; return res; } - //~Actor() { --actorCount; } }; template From 945d0246cddc0dcfff982f22af54c43617bc79a8 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Wed, 9 Dec 2020 13:28:15 -0700 Subject: [PATCH 007/461] add actor stacktrace feature --- flow/actorcompiler/ActorCompiler.cs | 3 ++- flow/flow.cpp | 6 ++++++ flow/flow.h | 12 ++++++++++++ 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/flow/actorcompiler/ActorCompiler.cs b/flow/actorcompiler/ActorCompiler.cs index dc9de91868..28771f4503 100644 --- a/flow/actorcompiler/ActorCompiler.cs +++ b/flow/actorcompiler/ActorCompiler.cs @@ -452,7 +452,7 @@ namespace actorcompiler fullClassName, string.Join(", ", actor.parameters.Select(p => p.name).ToArray())); - 
writer.WriteLine("restore_lineage _;"); + writer.WriteLine("\trestore_lineage _;"); if (actor.returnType != null) writer.WriteLine("\treturn Future<{1}>({0});", newActor, actor.returnType); else @@ -1287,6 +1287,7 @@ namespace actorcompiler constructor.WriteLine("{"); constructor.Indent(+1); ProbeEnter(constructor, actor.name); + constructor.WriteLine("currentLineage->modify(&StackLineage::actorName) = LiteralStringRef(\"{0}\");", actor.name); constructor.WriteLine("this->{0};", body.call()); ProbeExit(constructor, actor.name); WriteFunction(writer, constructor, constructor.BodyText); diff --git a/flow/flow.cpp b/flow/flow.cpp index 5b354fe054..2e47847fcd 100644 --- a/flow/flow.cpp +++ b/flow/flow.cpp @@ -37,6 +37,12 @@ ActorLineage::~ActorLineage() { } } +StringRef StackLineage::name = "StackLineage"_sr; + +std::stack getActorStackTrace() { + return currentLineage->stack(&StackLineage::actorName); +} + #if (defined(__linux__) || defined(__FreeBSD__)) && defined(__AVX__) && !defined(MEMORY_SANITIZER) // For benchmarking; need a version of rte_memcpy that doesn't live in the same compilation unit as the test. void * rte_memcpy_noinline(void *__restrict __dest, const void *__restrict __src, size_t __n) { diff --git a/flow/flow.h b/flow/flow.h index 0ffc895a86..518dbd036c 100644 --- a/flow/flow.h +++ b/flow/flow.h @@ -495,6 +495,18 @@ struct restore_lineage { ~restore_lineage() { currentLineage = prev; } }; +struct StackLineage : LineageProperties { + static StringRef name; + StringRef actorName; + + template + bool isSet(Value StackLineage::*member) { + return true; + } +}; + +extern std::stack getActorStackTrace(); + // SAV is short for Single Assignment Variable: It can be assigned for only once! 
template struct SAV : private Callback, FastAllocated> { From f8e1df6c4f8c5a687afffe2b9a28aa13e32ae9d5 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Thu, 10 Dec 2020 10:42:04 -0700 Subject: [PATCH 008/461] Support for actor stack traces --- fdbrpc/RoleLineage.h | 2 +- fdbserver/CMakeLists.txt | 1 + fdbserver/SigStack.cpp | 23 +++++++++++++++++++++++ fdbserver/worker.actor.cpp | 3 +++ flow/flow.h | 7 +------ tests/TestRunner/local_cluster.py | 2 +- 6 files changed, 30 insertions(+), 8 deletions(-) create mode 100644 fdbserver/SigStack.cpp diff --git a/fdbrpc/RoleLineage.h b/fdbrpc/RoleLineage.h index 30a2ea2650..8e9d3f4e9e 100644 --- a/fdbrpc/RoleLineage.h +++ b/fdbrpc/RoleLineage.h @@ -25,7 +25,7 @@ struct RoleLineage : LineageProperties { static StringRef name; ProcessClass::ClusterRole role = ProcessClass::NoRole; - bool isSet(ProcessClass::ClusterRole RoleLineage::*member) { + bool isSet(ProcessClass::ClusterRole RoleLineage::*member) const { return this->*member != ProcessClass::NoRole; } }; diff --git a/fdbserver/CMakeLists.txt b/fdbserver/CMakeLists.txt index bf266069cb..f52e5b8279 100644 --- a/fdbserver/CMakeLists.txt +++ b/fdbserver/CMakeLists.txt @@ -88,6 +88,7 @@ set(FDBSERVER_SRCS ResolverInterface.h ServerDBInfo.actor.h ServerDBInfo.h + SigStack.cpp SimulatedCluster.actor.cpp SimulatedCluster.h SkipList.cpp diff --git a/fdbserver/SigStack.cpp b/fdbserver/SigStack.cpp new file mode 100644 index 0000000000..efec5aff7d --- /dev/null +++ b/fdbserver/SigStack.cpp @@ -0,0 +1,23 @@ +#include "flow/flow.h" +#include +#include +#include + +// This is not yet correct, as this is not async safe +// However, this should be good enough for an initial +// proof of concept. 
+extern "C" void stackSignalHandler(int sig) { + auto stack = getActorStackTrace(); + int i = 0; + while (!stack.empty()) { + auto s = stack.top(); + stack.pop(); + std::string_view n(reinterpret_cast(s.begin()), s.size()); + std::cout << i << ": " << n << std::endl; + ++i; + } +} + +void setupStackSignal() { + std::signal(SIGUSR1, &stackSignalHandler); +} diff --git a/fdbserver/worker.actor.cpp b/fdbserver/worker.actor.cpp index 98363ea247..5d371c0c80 100644 --- a/fdbserver/worker.actor.cpp +++ b/fdbserver/worker.actor.cpp @@ -1798,6 +1798,8 @@ ACTOR Future monitorLeaderRemotelyWithDelayedCandidacy( Reference fdbd( Reference connFile, LocalityData localities, @@ -1812,6 +1814,7 @@ ACTOR Future fdbd( { state vector> actors; state Promise recoveredDiskFiles; + setupStackSignal(); currentLineage->modify(&RoleLineage::role) = ProcessClass::Worker; try { diff --git a/flow/flow.h b/flow/flow.h index 518dbd036c..b1e4c1e1fb 100644 --- a/flow/flow.h +++ b/flow/flow.h @@ -427,7 +427,7 @@ struct LineageProperties : LineagePropertiesBase { // A user should implement this for any type // within the properies class. 
template - bool isSet(Value Derived::*member) { + bool isSet(Value Derived::*member) const { return true; } }; @@ -498,11 +498,6 @@ struct restore_lineage { struct StackLineage : LineageProperties { static StringRef name; StringRef actorName; - - template - bool isSet(Value StackLineage::*member) { - return true; - } }; extern std::stack getActorStackTrace(); diff --git a/tests/TestRunner/local_cluster.py b/tests/TestRunner/local_cluster.py index 68318d51dd..85f2094774 100644 --- a/tests/TestRunner/local_cluster.py +++ b/tests/TestRunner/local_cluster.py @@ -38,7 +38,7 @@ cluster_file = {etcdir}/fdb.cluster command = {fdbserver_bin} public_address = auto:$ID listen_address = public -datadir = {datadir} +datadir = {datadir}/$ID logdir = {logdir} # logsize = 10MiB # maxlogssize = 100MiB From fb64902d5c5b6e88501ebe906d4d939f61257b9b Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Tue, 19 Jan 2021 16:04:09 -0700 Subject: [PATCH 009/461] Assign roles --- fdbrpc/CMakeLists.txt | 2 -- fdbserver/CMakeLists.txt | 2 ++ .../RoleLineage.actor.cpp | 2 +- .../RoleLineage.actor.h | 21 ++++++++++++++- fdbserver/worker.actor.cpp | 26 ++++++++++++++++++- flow/flow.cpp | 5 ++-- flow/flow.h | 16 ++++++++++++ 7 files changed, 67 insertions(+), 7 deletions(-) rename fdbrpc/RoleLineage.cpp => fdbserver/RoleLineage.actor.cpp (95%) rename fdbrpc/RoleLineage.h => fdbserver/RoleLineage.actor.h (59%) diff --git a/fdbrpc/CMakeLists.txt b/fdbrpc/CMakeLists.txt index 7a9ce26a10..af84676be7 100644 --- a/fdbrpc/CMakeLists.txt +++ b/fdbrpc/CMakeLists.txt @@ -22,8 +22,6 @@ set(FDBRPC_SRCS ReplicationPolicy.cpp ReplicationTypes.cpp ReplicationUtils.cpp - RoleLineage.h - RoleLineage.cpp Stats.actor.cpp Stats.h sim2.actor.cpp diff --git a/fdbserver/CMakeLists.txt b/fdbserver/CMakeLists.txt index afc45b2cc4..9e406a0d26 100644 --- a/fdbserver/CMakeLists.txt +++ b/fdbserver/CMakeLists.txt @@ -86,6 +86,8 @@ set(FDBSERVER_SRCS RestoreWorker.actor.cpp Resolver.actor.cpp ResolverInterface.h + 
RoleLineage.actor.h + RoleLineage.actor.cpp ServerDBInfo.actor.h ServerDBInfo.h SigStack.cpp diff --git a/fdbrpc/RoleLineage.cpp b/fdbserver/RoleLineage.actor.cpp similarity index 95% rename from fdbrpc/RoleLineage.cpp rename to fdbserver/RoleLineage.actor.cpp index 89a64bbe40..6d1b49527a 100644 --- a/fdbrpc/RoleLineage.cpp +++ b/fdbserver/RoleLineage.actor.cpp @@ -18,6 +18,6 @@ * limitations under the License. */ -#include "fdbrpc/RoleLineage.h" +#include "fdbserver/RoleLineage.actor.h" StringRef RoleLineage::name = "RoleLineage"_sr; diff --git a/fdbrpc/RoleLineage.h b/fdbserver/RoleLineage.actor.h similarity index 59% rename from fdbrpc/RoleLineage.h rename to fdbserver/RoleLineage.actor.h index 8e9d3f4e9e..d35c749771 100644 --- a/fdbrpc/RoleLineage.h +++ b/fdbserver/RoleLineage.actor.h @@ -1,5 +1,5 @@ /* - * RoleLineage.h + * RoleLineage.actor.h * * This source file is part of the FoundationDB open source project * @@ -19,7 +19,15 @@ */ #pragma once +#include "flow/flow.h" +#if defined(NO_INTELLISENSE) && !defined(FDBSERVER_ROLE_LINEAGE_ACTOR_G_H) +# define FDBSERVER_ROLE_LINEAGE_ACTOR_G_H +# include "fdbserver/RoleLineage.actor.g.h" +#elif !defined(FDBSERVER_ROLE_LINEAGE_ACTOR_H) +# define FDBSERVER_ROLE_LINEAGE_ACTOR_H + #include "fdbrpc/Locality.h" +#include "flow/actorcompiler.h" // This must be the last include struct RoleLineage : LineageProperties { static StringRef name; @@ -29,3 +37,14 @@ struct RoleLineage : LineageProperties { return this->*member != ProcessClass::NoRole; } }; + +// creates a new root and sets the role lineage +ACTOR template +Future()())> runInRole(Fun fun, ProcessClass::ClusterRole role) { + currentLineage->makeRoot(); + currentLineage->modify(&RoleLineage::role) = role; + decltype(std::declval()()) res = wait(fun()); + return res; +} + +#endif diff --git a/fdbserver/worker.actor.cpp b/fdbserver/worker.actor.cpp index 36f5c14860..19aea8622c 100644 --- a/fdbserver/worker.actor.cpp +++ b/fdbserver/worker.actor.cpp @@ -22,7 +22,6 @@ 
#include #include "fdbrpc/Locality.h" -#include "fdbrpc/RoleLineage.h" #include "fdbclient/StorageServerInterface.h" #include "fdbserver/Knobs.h" #include "flow/ActorCollection.h" @@ -33,6 +32,7 @@ #include "fdbclient/NativeAPI.actor.h" #include "fdbserver/MetricLogger.h" #include "fdbserver/BackupInterface.h" +#include "fdbserver/RoleLineage.actor.h" #include "fdbserver/WorkerInterface.actor.h" #include "fdbserver/IKeyValueStore.h" #include "fdbserver/WaitFailure.h" @@ -1024,6 +1024,8 @@ ACTOR Future workerServer( DiskStore s = stores[f]; // FIXME: Error handling if( s.storedComponent == DiskStore::Storage ) { + LocalLineage _; + currentLineage->modify(&RoleLineage::role) = ProcessClass::ClusterRole::Storage; IKeyValueStore* kv = openKVStore(s.storeType, s.filename, s.storeID, memoryLimit, false, validateDataFiles); Future kvClosed = kv->onClosed(); filesClosed.add( kvClosed ); @@ -1058,6 +1060,8 @@ ACTOR Future workerServer( f = storageServerRollbackRebooter( f, s.storeType, s.filename, recruited.id(), recruited.locality, dbInfo, folder, &filesClosed, memoryLimit, kv); errorForwarders.add( forwardError( errors, Role::STORAGE_SERVER, recruited.id(), f ) ); } else if( s.storedComponent == DiskStore::TLogData ) { + LocalLineage _; + currentLineage->modify(&RoleLineage::role) = ProcessClass::ClusterRole::TLog; std::string logQueueBasename; const std::string filename = basename(s.filename); if (StringRef(filename).startsWith(fileLogDataPrefix)) { @@ -1218,6 +1222,8 @@ ACTOR Future workerServer( } } when( RecruitMasterRequest req = waitNext(interf.master.getFuture()) ) { + LocalLineage _; + currentLineage->modify(&RoleLineage::role) = ProcessClass::ClusterRole::Master; MasterInterface recruited; recruited.locality = locality; recruited.initEndpoints(); @@ -1238,6 +1244,8 @@ ACTOR Future workerServer( req.reply.send(recruited); } when ( InitializeDataDistributorRequest req = waitNext(interf.dataDistributor.getFuture()) ) { + LocalLineage _; + 
currentLineage->modify(&RoleLineage::role) = ProcessClass::ClusterRole::DataDistributor; DataDistributorInterface recruited(locality); recruited.initEndpoints(); @@ -1256,6 +1264,8 @@ ACTOR Future workerServer( req.reply.send(recruited); } when ( InitializeRatekeeperRequest req = waitNext(interf.ratekeeper.getFuture()) ) { + LocalLineage _; + currentLineage->modify(&RoleLineage::role) = ProcessClass::ClusterRole::Ratekeeper; RatekeeperInterface recruited(locality, req.reqId); recruited.initEndpoints(); @@ -1280,6 +1290,8 @@ ACTOR Future workerServer( } when (InitializeBackupRequest req = waitNext(interf.backup.getFuture())) { if (!backupWorkerCache.exists(req.reqId)) { + LocalLineage _; + currentLineage->modify(&RoleLineage::role) = ProcessClass::ClusterRole::Backup; BackupInterface recruited(locality); recruited.initEndpoints(); @@ -1309,6 +1321,8 @@ ACTOR Future workerServer( .detail("MinRecruitable", TLogVersion::MIN_RECRUITABLE); req.reply.sendError(internal_error()); } + LocalLineage _; + currentLineage->modify(&RoleLineage::role) = ProcessClass::ClusterRole::TLog; TLogOptions tLogOptions(req.logVersion, req.spillType); TLogFn tLogFn = tLogFnForOptions(tLogOptions); auto& logData = sharedLogs[SharedLogsKey(tLogOptions, req.storeType)]; @@ -1341,6 +1355,8 @@ ACTOR Future workerServer( } when( InitializeStorageRequest req = waitNext(interf.storage.getFuture()) ) { if( !storageCache.exists( req.reqId ) ) { + LocalLineage _; + currentLineage->modify(&RoleLineage::role) = ProcessClass::ClusterRole::Storage; StorageServerInterface recruited(req.interfaceId); recruited.locality = locality; recruited.initEndpoints(); @@ -1379,6 +1395,8 @@ ACTOR Future workerServer( forwardPromise( req.reply, storageCache.get( req.reqId ) ); } when(InitializeCommitProxyRequest req = waitNext(interf.commitProxy.getFuture())) { + LocalLineage _; + currentLineage->modify(&RoleLineage::role) = ProcessClass::ClusterRole::CommitProxy; CommitProxyInterface recruited; recruited.processId = 
locality.processId(); recruited.provisional = false; @@ -1402,6 +1420,8 @@ ACTOR Future workerServer( req.reply.send(recruited); } when( InitializeGrvProxyRequest req = waitNext(interf.grvProxy.getFuture()) ) { + LocalLineage _; + currentLineage->modify(&RoleLineage::role) = ProcessClass::ClusterRole::GrvProxy; GrvProxyInterface recruited; recruited.processId = locality.processId(); recruited.provisional = false; @@ -1421,6 +1441,8 @@ ACTOR Future workerServer( req.reply.send(recruited); } when( InitializeResolverRequest req = waitNext(interf.resolver.getFuture()) ) { + LocalLineage _; + currentLineage->modify(&RoleLineage::role) = ProcessClass::ClusterRole::Resolver; ResolverInterface recruited; recruited.locality = locality; recruited.initEndpoints(); @@ -1438,6 +1460,8 @@ ACTOR Future workerServer( req.reply.send(recruited); } when( InitializeLogRouterRequest req = waitNext(interf.logRouter.getFuture()) ) { + LocalLineage _; + currentLineage->modify(&RoleLineage::role) = ProcessClass::ClusterRole::LogRouter; TLogInterface recruited(locality); recruited.initEndpoints(); diff --git a/flow/flow.cpp b/flow/flow.cpp index 2e47847fcd..c90bbbe9ae 100644 --- a/flow/flow.cpp +++ b/flow/flow.cpp @@ -28,8 +28,9 @@ thread_local Reference currentLineage; -ActorLineage::ActorLineage() : parent(currentLineage) { -} +LineagePropertiesBase::~LineagePropertiesBase() {} + +ActorLineage::ActorLineage() : parent(currentLineage) {} ActorLineage::~ActorLineage() { for (auto ptr : properties) { diff --git a/flow/flow.h b/flow/flow.h index e043ab49d4..9b3ba698b6 100644 --- a/flow/flow.h +++ b/flow/flow.h @@ -412,6 +412,7 @@ struct SingleCallback { }; struct LineagePropertiesBase { + virtual ~LineagePropertiesBase(); }; // helper class to make implementation of LineageProperties easier @@ -433,6 +434,7 @@ struct LineageProperties : LineagePropertiesBase { }; struct ActorLineage : ReferenceCounted { + friend class LocalLineage; private: std::unordered_map properties; Reference parent; @@ 
-489,6 +491,20 @@ public: extern thread_local Reference currentLineage; +// This class can be used in order to modify all lineage properties +// of actors created within a (non-actor) scope +struct LocalLineage { + Reference lineage = Reference{new ActorLineage() }; + Reference oldLineage; + LocalLineage() { + oldLineage = currentLineage; + currentLineage = lineage; + } + ~LocalLineage() { + currentLineage = oldLineage; + } +}; + struct restore_lineage { Reference prev; restore_lineage() : prev(currentLineage) {} From f40d8c2f490a08351ce3d7e91bfd6752e268548a Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Tue, 19 Jan 2021 16:04:21 -0700 Subject: [PATCH 010/461] make profiler signal handler reentrant safe --- flow/Profiler.actor.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/flow/Profiler.actor.cpp b/flow/Profiler.actor.cpp index ece9bcfafd..33d1542db7 100644 --- a/flow/Profiler.actor.cpp +++ b/flow/Profiler.actor.cpp @@ -148,6 +148,8 @@ struct Profiler { } void signal_handler() { // async signal safe! + static std::atomic inSigHandler = false; + if (!inSigHandler.exchange(true)) { return; } if(profilingEnabled) { double t = timer(); output_buffer->push(*(void**)&t); @@ -156,6 +158,7 @@ struct Profiler { output_buffer->push(addresses[i]); output_buffer->push((void*)-1LL); } + inSigHandler.store(false); } static void signal_handler_for_closure(int, siginfo_t* si, void*, void* self) { // async signal safe! From c3efbe3040770dae65319446b9b3877f29b0ee44 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Tue, 19 Jan 2021 16:52:30 -0700 Subject: [PATCH 011/461] fixed minor bug --- flow/Profiler.actor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flow/Profiler.actor.cpp b/flow/Profiler.actor.cpp index 33d1542db7..d691f46205 100644 --- a/flow/Profiler.actor.cpp +++ b/flow/Profiler.actor.cpp @@ -149,7 +149,7 @@ struct Profiler { void signal_handler() { // async signal safe! 
static std::atomic inSigHandler = false; - if (!inSigHandler.exchange(true)) { return; } + if (inSigHandler.exchange(true)) { return; } if(profilingEnabled) { double t = timer(); output_buffer->push(*(void**)&t); From 29c626ca6a0d02f1d412327e177cc5db36b02042 Mon Sep 17 00:00:00 2001 From: Jon Fu Date: Mon, 15 Mar 2021 17:36:13 -0400 Subject: [PATCH 012/461] Changed code flow to fix loophole that avoided the knob guarding higher protocol versions and also added new restarting tests --- fdbserver/MoveKeys.actor.cpp | 24 ++++++++------- tests/CMakeLists.txt | 3 ++ .../to_6.2.33/CycleTestRestart-1.txt | 30 +++++++++++++++++++ .../to_6.2.33/CycleTestRestart-2.txt | 26 ++++++++++++++++ 4 files changed, 73 insertions(+), 10 deletions(-) create mode 100644 tests/restarting/to_6.2.33/CycleTestRestart-1.txt create mode 100644 tests/restarting/to_6.2.33/CycleTestRestart-2.txt diff --git a/fdbserver/MoveKeys.actor.cpp b/fdbserver/MoveKeys.actor.cpp index c08f3f3476..83f7170e95 100644 --- a/fdbserver/MoveKeys.actor.cpp +++ b/fdbserver/MoveKeys.actor.cpp @@ -1232,23 +1232,27 @@ void seedShardServers(Arena& arena, CommitTransactionRef& tr, vector serverTags; + std::vector serverSrcUID; serverTags.reserve(servers.size()); - for (int i = 0; i < servers.size(); i++) - serverTags.push_back(server_tag[servers[i].id()]); + for (auto& s : servers) { + serverTags.push_back(server_tag[s.id()]); + serverSrcUID.push_back(s.id()); + } + auto ksValue = CLIENT_KNOBS->TAG_ENCODE_KEY_SERVERS ? 
keyServersValue(serverTags) + : keyServersValue(Standalone(), serverSrcUID); // We have to set this range in two blocks, because the master tracking of "keyServersLocations" depends on a change // to a specific // key (keyServersKeyServersKey) - krmSetPreviouslyEmptyRange( - tr, arena, keyServersPrefix, KeyRangeRef(KeyRef(), allKeys.end), keyServersValue(serverTags), Value()); + krmSetPreviouslyEmptyRange(tr, arena, keyServersPrefix, KeyRangeRef(KeyRef(), allKeys.end), ksValue, Value()); - for (int s = 0; s < servers.size(); s++) - krmSetPreviouslyEmptyRange( - tr, arena, serverKeysPrefixFor(servers[s].id()), allKeys, serverKeysTrue, serverKeysFalse); + for (auto& s : servers) { + krmSetPreviouslyEmptyRange(tr, arena, serverKeysPrefixFor(s.id()), allKeys, serverKeysTrue, serverKeysFalse); + } } diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 132616b1bb..16f0eb2170 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -204,6 +204,9 @@ if(WITH_PYTHON) add_fdb_test( TEST_FILES restarting/from_7.0.0/UpgradeAndBackupRestore-1.toml restarting/from_7.0.0/UpgradeAndBackupRestore-2.toml) + add_fdb_test( + TEST_FILES restarting/to_6.2.33/CycleTestRestart-1.txt + restarting/to_6.2.33/CycleTestRestart-2.txt IGNORE) add_fdb_test( TEST_FILES restarting/to_6.3.10/CycleTestRestart-1.txt restarting/to_6.3.10/CycleTestRestart-2.txt) diff --git a/tests/restarting/to_6.2.33/CycleTestRestart-1.txt b/tests/restarting/to_6.2.33/CycleTestRestart-1.txt new file mode 100644 index 0000000000..647c2f3fe3 --- /dev/null +++ b/tests/restarting/to_6.2.33/CycleTestRestart-1.txt @@ -0,0 +1,30 @@ +testTitle=Clogged + clearAfterTest=false + testName=Cycle + transactionsPerSecond=500.0 + nodeCount=2500 + testDuration=10.0 + expectedRate=0 + + testName=RandomClogging + testDuration=10.0 + + testName=Rollback + meanDelay=10.0 + testDuration=10.0 + + testName=Attrition + machinesToKill=10 + machinesToLeave=3 + reboot=true + testDuration=10.0 + + testName=Attrition + 
machinesToKill=10 + machinesToLeave=3 + reboot=true + testDuration=10.0 + + testName=SaveAndKill + restartInfoLocation=simfdb/restartInfo.ini + testDuration=10.0 diff --git a/tests/restarting/to_6.2.33/CycleTestRestart-2.txt b/tests/restarting/to_6.2.33/CycleTestRestart-2.txt new file mode 100644 index 0000000000..7d498f2be1 --- /dev/null +++ b/tests/restarting/to_6.2.33/CycleTestRestart-2.txt @@ -0,0 +1,26 @@ +testTitle=Clogged + runSetup=false + testName=Cycle + transactionsPerSecond=2500.0 + nodeCount=2500 + testDuration=10.0 + expectedRate=0 + + testName=RandomClogging + testDuration=10.0 + + testName=Rollback + meanDelay=10.0 + testDuration=10.0 + + testName=Attrition + machinesToKill=10 + machinesToLeave=3 + reboot=true + testDuration=10.0 + + testName=Attrition + machinesToKill=10 + machinesToLeave=3 + reboot=true + testDuration=10.0 From a8c7a798f2483c22ffd6c8dacbb0946c81237c12 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Wed, 17 Mar 2021 15:34:20 -0600 Subject: [PATCH 013/461] First prototype of actorlineageset --- flow/ActorLineageSet.cpp | 118 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 118 insertions(+) create mode 100644 flow/ActorLineageSet.cpp diff --git a/flow/ActorLineageSet.cpp b/flow/ActorLineageSet.cpp new file mode 100644 index 0000000000..9fb93e9df7 --- /dev/null +++ b/flow/ActorLineageSet.cpp @@ -0,0 +1,118 @@ +/* + * ActorLineageSet.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flow/flow.h" +#include + +class ActorLineageSet { +public: + // The type we use for lookup into the set. Gets assigned during insert + using Index = unsigned; + // For now we use a fixed size capacity + constexpr static Index CAPACITY = 1024; + constexpr static Index npos = std::numeric_limits::max(); + + explicit ActorLineageSet(); + ActorLineageSet(const ActorLineageSet&) = delete; + ActorLineageSet& operator=(const ActorLineageSet&) = delete; + + // Returns the number of elements at the time of calling. Keep in mind that this is a lockfree data structure, so + // the actual size might change anytime after or even during the call. This function only guarantees that the size + // was whatever the method returns at one point between the start and the end of the function call. The safest way + // to handle this is by assuming that this returns an estimate. 
+ unsigned size(); + + Index insert(const Reference& lineage); + void erase(Index idx); + std::vector> copy(); + +private: + static constexpr uintptr_t FREE = 0b1; + static constexpr uintptr_t LOCK = 0b10; + std::atomic _size = 0; + std::vector> _set; + boost::lockfree::queue, boost::lockfree::capacity> freeQueue; + boost::lockfree::queue, boost::lockfree::capacity> + freeList; +}; + +ActorLineageSet::ActorLineageSet() { + // insert the free indexes in reverse order + for (unsigned i = CAPACITY; i > 0; --i) { + freeQueue.push(i - 1); + _set[i] = uintptr_t(1); + } +} + +std::vector> ActorLineageSet::copy() { + std::vector> result; + for (int i = 0; i < CAPACITY; ++i) { + auto ptr = _set[i].load(); + if ((ptr & FREE) != 0) { + ASSERT((ptr & LOCK) == 0); + if (_set[i].compare_exchange_strong(ptr, ptr | LOCK)) { + ActorLineage* entry = reinterpret_cast(ptr); + ptr |= LOCK; + entry->addref(); + // we try to unlock now. If this element was removed while we incremented the refcount, the element will + // end up in the freeList, so we will decrement later. + _set[i].compare_exchange_strong(ptr, ptr ^ LOCK); + result.emplace_back(entry); + } + } + } + // after we're done we need to clean up all objects that contented on a lock. 
This won't be perfect (as some thread + // might not yet added the object to the free list), but whatever we don't get now we'll clean up in the next + // iteration + ActorLineage* toClean; + while (freeList.pop(toClean)) { + toClean->delref(); + } + return result; +} + +ActorLineageSet::Index ActorLineageSet::insert(const Reference& lineage) { + Index res; + if (!freeQueue.pop(res)) { + TraceEvent(SevWarnAlways, "NoCapacityInActorLineageSet"); + return npos; + } + ASSERT(_set[res].load() & FREE); + auto ptr = reinterpret_cast(lineage.getPtr()); + lineage->addref(); + _set[res].store(ptr); + return res; +} + +void ActorLineageSet::erase(Index idx) { + while (true) { + auto ptr = _set[idx].load(); + if (ptr & LOCK) { + _set[idx].store(FREE); + freeList.push(reinterpret_cast(ptr ^ LOCK)); + return; + } else { + if (_set[idx].compare_exchange_strong(ptr, FREE)) { + reinterpret_cast(ptr)->delref(); + return; + } + } + } +} \ No newline at end of file From 9812a49058adf16c2cdd1445f876f372be074109 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Wed, 17 Mar 2021 15:40:19 -0600 Subject: [PATCH 014/461] use consume_all to clean up after copy --- flow/ActorLineageSet.cpp | 5 +---- flow/CMakeLists.txt | 1 + 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/flow/ActorLineageSet.cpp b/flow/ActorLineageSet.cpp index 9fb93e9df7..0957339501 100644 --- a/flow/ActorLineageSet.cpp +++ b/flow/ActorLineageSet.cpp @@ -81,10 +81,7 @@ std::vector> ActorLineageSet::copy() { // after we're done we need to clean up all objects that contented on a lock. 
This won't be perfect (as some thread // might not yet added the object to the free list), but whatever we don't get now we'll clean up in the next // iteration - ActorLineage* toClean; - while (freeList.pop(toClean)) { - toClean->delref(); - } + freeList.consume_all([](auto toClean) { toClean->delRef(); }); return result; } diff --git a/flow/CMakeLists.txt b/flow/CMakeLists.txt index c838e8eff8..5e89fe4d28 100644 --- a/flow/CMakeLists.txt +++ b/flow/CMakeLists.txt @@ -3,6 +3,7 @@ find_package(Threads REQUIRED) set(FLOW_SRCS ActorCollection.actor.cpp ActorCollection.h + ActorLineageSet.cpp Arena.cpp Arena.h AsioReactor.h From f6c7aa6ac77e55266e030109eb77d24b8894952e Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Wed, 17 Mar 2021 15:50:29 -0600 Subject: [PATCH 015/461] fixed typo --- flow/ActorLineageSet.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flow/ActorLineageSet.cpp b/flow/ActorLineageSet.cpp index 0957339501..9a0d34c9bf 100644 --- a/flow/ActorLineageSet.cpp +++ b/flow/ActorLineageSet.cpp @@ -81,7 +81,7 @@ std::vector> ActorLineageSet::copy() { // after we're done we need to clean up all objects that contented on a lock. 
This won't be perfect (as some thread // might not yet added the object to the free list), but whatever we don't get now we'll clean up in the next // iteration - freeList.consume_all([](auto toClean) { toClean->delRef(); }); + freeList.consume_all([](auto toClean) { toClean->delref(); }); return result; } From 4f1b807e1f480f24a0e3cb9622149953c295a4ab Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Wed, 17 Mar 2021 16:01:23 -0600 Subject: [PATCH 016/461] assert object alignment --- flow/ActorLineageSet.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/flow/ActorLineageSet.cpp b/flow/ActorLineageSet.cpp index 9a0d34c9bf..570976379c 100644 --- a/flow/ActorLineageSet.cpp +++ b/flow/ActorLineageSet.cpp @@ -93,6 +93,7 @@ ActorLineageSet::Index ActorLineageSet::insert(const Reference& li } ASSERT(_set[res].load() & FREE); auto ptr = reinterpret_cast(lineage.getPtr()); + ASSERT((ptr % 4) == 0); // this needs to be at least 4-byte aligned lineage->addref(); _set[res].store(ptr); return res; From 650e0de62570338ebff06cedc819a9bb00a0b925 Mon Sep 17 00:00:00 2001 From: Jon Fu Date: Thu, 18 Mar 2021 15:32:17 -0400 Subject: [PATCH 017/461] Remove extra downgrade workloads to restrict downgrade testing to 1 version apart --- tests/CMakeLists.txt | 3 -- .../to_6.2.33/CycleTestRestart-1.txt | 30 ------------------- .../to_6.2.33/CycleTestRestart-2.txt | 26 ---------------- 3 files changed, 59 deletions(-) delete mode 100644 tests/restarting/to_6.2.33/CycleTestRestart-1.txt delete mode 100644 tests/restarting/to_6.2.33/CycleTestRestart-2.txt diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 16f0eb2170..132616b1bb 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -204,9 +204,6 @@ if(WITH_PYTHON) add_fdb_test( TEST_FILES restarting/from_7.0.0/UpgradeAndBackupRestore-1.toml restarting/from_7.0.0/UpgradeAndBackupRestore-2.toml) - add_fdb_test( - TEST_FILES restarting/to_6.2.33/CycleTestRestart-1.txt - restarting/to_6.2.33/CycleTestRestart-2.txt IGNORE) 
add_fdb_test( TEST_FILES restarting/to_6.3.10/CycleTestRestart-1.txt restarting/to_6.3.10/CycleTestRestart-2.txt) diff --git a/tests/restarting/to_6.2.33/CycleTestRestart-1.txt b/tests/restarting/to_6.2.33/CycleTestRestart-1.txt deleted file mode 100644 index 647c2f3fe3..0000000000 --- a/tests/restarting/to_6.2.33/CycleTestRestart-1.txt +++ /dev/null @@ -1,30 +0,0 @@ -testTitle=Clogged - clearAfterTest=false - testName=Cycle - transactionsPerSecond=500.0 - nodeCount=2500 - testDuration=10.0 - expectedRate=0 - - testName=RandomClogging - testDuration=10.0 - - testName=Rollback - meanDelay=10.0 - testDuration=10.0 - - testName=Attrition - machinesToKill=10 - machinesToLeave=3 - reboot=true - testDuration=10.0 - - testName=Attrition - machinesToKill=10 - machinesToLeave=3 - reboot=true - testDuration=10.0 - - testName=SaveAndKill - restartInfoLocation=simfdb/restartInfo.ini - testDuration=10.0 diff --git a/tests/restarting/to_6.2.33/CycleTestRestart-2.txt b/tests/restarting/to_6.2.33/CycleTestRestart-2.txt deleted file mode 100644 index 7d498f2be1..0000000000 --- a/tests/restarting/to_6.2.33/CycleTestRestart-2.txt +++ /dev/null @@ -1,26 +0,0 @@ -testTitle=Clogged - runSetup=false - testName=Cycle - transactionsPerSecond=2500.0 - nodeCount=2500 - testDuration=10.0 - expectedRate=0 - - testName=RandomClogging - testDuration=10.0 - - testName=Rollback - meanDelay=10.0 - testDuration=10.0 - - testName=Attrition - machinesToKill=10 - machinesToLeave=3 - reboot=true - testDuration=10.0 - - testName=Attrition - machinesToKill=10 - machinesToLeave=3 - reboot=true - testDuration=10.0 From 5c1b674815b1765dbc08eed4d98875163dee5708 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Fri, 19 Mar 2021 10:31:58 -0600 Subject: [PATCH 018/461] implemented test --- flow/CMakeLists.txt | 2 +- flow/WriteOnlySet.actor.cpp | 159 +++++++++++++++++++ flow/{ActorLineageSet.cpp => WriteOnlySet.h} | 75 ++++----- 3 files changed, 187 insertions(+), 49 deletions(-) create mode 100644 
flow/WriteOnlySet.actor.cpp rename flow/{ActorLineageSet.cpp => WriteOnlySet.h} (60%) diff --git a/flow/CMakeLists.txt b/flow/CMakeLists.txt index 5e89fe4d28..4c28aee437 100644 --- a/flow/CMakeLists.txt +++ b/flow/CMakeLists.txt @@ -3,7 +3,6 @@ find_package(Threads REQUIRED) set(FLOW_SRCS ActorCollection.actor.cpp ActorCollection.h - ActorLineageSet.cpp Arena.cpp Arena.h AsioReactor.h @@ -70,6 +69,7 @@ set(FLOW_SRCS TreeBenchmark.h UnitTest.cpp UnitTest.h + WriteOnlySet.actor.cpp XmlTraceLogFormatter.cpp XmlTraceLogFormatter.h actorcompiler.h diff --git a/flow/WriteOnlySet.actor.cpp b/flow/WriteOnlySet.actor.cpp new file mode 100644 index 0000000000..d0f7c514ad --- /dev/null +++ b/flow/WriteOnlySet.actor.cpp @@ -0,0 +1,159 @@ +/* + * WriteOnlySet.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2021 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "flow/DeterministicRandom.h" +#include "flow/WriteOnlySet.h" +#include "flow/flow.h" +#include "flow/UnitTest.h" + +#include +#include +#include "flow/actorcompiler.h" // has to be last include + +template +auto WriteOnlySet::insert(const Reference& lineage) -> Index { + Index res; + if (!freeQueue.pop(res)) { + TraceEvent(SevWarnAlways, "NoCapacityInWriteOnlySet"); + return npos; + } + ASSERT(_set[res].load() & FREE); + auto ptr = reinterpret_cast(lineage.getPtr()); + ASSERT((ptr % 4) == 0); // this needs to be at least 4-byte aligned + ASSERT((ptr & FREE) == 0 && (ptr & LOCK) == 0); + lineage->addref(); + _set[res].store(ptr); + return res; +} + +template +void WriteOnlySet::erase(Index idx) { + while (true) { + auto ptr = _set[idx].load(); + if (ptr & LOCK) { + _set[idx].store(FREE); + freeList.push(reinterpret_cast(ptr ^ LOCK)); + return; + } else { + if (_set[idx].compare_exchange_strong(ptr, FREE)) { + reinterpret_cast(ptr)->delref(); + return; + } + } + } +} + +// Explicit instantiation +template class WriteOnlySet; + +// testing code +namespace { + +std::atomic instanceCounter = 0; +constexpr double iteration_frequency = 10.0; + +struct TestObject { + mutable std::atomic _refCount = 1; + TestObject() { instanceCounter.fetch_add(1); } + void delref() const { + if (--_refCount == 0) { + delete this; + --instanceCounter; + } + } + void addref() const { ++_refCount; } +}; + +using TestSet = WriteOnlySet; +using Clock = std::chrono::steady_clock; + +ACTOR Future threadjoiner(std::shared_ptr> threads, std::shared_ptr set) { + loop { + wait(delay(0.1)); + for (unsigned i = 0;;) { + if (threads->size() == i) { + break; + } + auto& t = (*threads)[i]; + if (t.joinable()) { + t.join(); + if (i + 1 < threads->size()) { + std::swap(*threads->rbegin(), (*threads)[i]); + } + threads->pop_back(); + } else { + ++i; + } + } + if (threads->empty()) { + set->copy(); + ASSERT(instanceCounter.load() == 0); + return Void(); + } + } +} + +void 
testCopier(std::shared_ptr set, std::chrono::seconds runFor) { + auto start = Clock::now(); + while (true) { + if (Clock::now() - start > runFor) { + return; + } + auto copy = set->copy(); + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + } +} + +void writer(std::shared_ptr set, std::chrono::seconds runFor) { + auto start = Clock::now(); + std::random_device rDev; + DeterministicRandom rnd(rDev()); + while (true) { + if (Clock::now() - start > runFor) { + return; + } + std::vector positions; + for (int i = 0; i < rnd.randomInt(1, 101); ++i) { + positions.push_back(set->insert(Reference(new TestObject()))); + } + rnd.randomShuffle(positions); + for (auto p : positions) { + set->erase(p); + } + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + } +} + +TEST_CASE("/flow/WriteOnlySet") { + if (g_network->isSimulated()) { + // This test is not deterministic, so we shouldn't run it in simulation + return Void(); + } + auto set = std::make_shared(); + auto threads = std::make_shared>(); + std::chrono::seconds runFor(10); + for (int i = 0; i < 5; ++i) { + threads->emplace_back([set, runFor]() { writer(set, runFor); }); + } + threads->emplace_back([set, runFor]() { testCopier(set, runFor); }); + wait(threadjoiner(threads, set)); + return Void(); +} +} // namespace \ No newline at end of file diff --git a/flow/ActorLineageSet.cpp b/flow/WriteOnlySet.h similarity index 60% rename from flow/ActorLineageSet.cpp rename to flow/WriteOnlySet.h index 570976379c..a319ad22f0 100644 --- a/flow/ActorLineageSet.cpp +++ b/flow/WriteOnlySet.h @@ -1,9 +1,9 @@ /* - * ActorLineageSet.cpp + * WriteOnlySet.cpp * * This source file is part of the FoundationDB open source project * - * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors + * Copyright 2013-2021 Apple Inc. and the FoundationDB project authors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -18,20 +18,23 @@ * limitations under the License. */ -#include "flow/flow.h" +#pragma once +#include "flow/Error.h" +#include "flow/FastRef.h" +#include "flow/Trace.h" #include -class ActorLineageSet { +template +class WriteOnlySet { public: // The type we use for lookup into the set. Gets assigned during insert - using Index = unsigned; + using Index = IndexType; // For now we use a fixed size capacity - constexpr static Index CAPACITY = 1024; constexpr static Index npos = std::numeric_limits::max(); - explicit ActorLineageSet(); - ActorLineageSet(const ActorLineageSet&) = delete; - ActorLineageSet& operator=(const ActorLineageSet&) = delete; + explicit WriteOnlySet(); + WriteOnlySet(const WriteOnlySet&) = delete; + WriteOnlySet& operator=(const WriteOnlySet&) = delete; // Returns the number of elements at the time of calling. Keep in mind that this is a lockfree data structure, so // the actual size might change anytime after or even during the call. This function only guarantees that the size @@ -39,36 +42,39 @@ public: // to handle this is by assuming that this returns an estimate. 
unsigned size(); - Index insert(const Reference& lineage); + Index insert(const Reference& lineage); void erase(Index idx); - std::vector> copy(); + std::vector> copy(); private: static constexpr uintptr_t FREE = 0b1; static constexpr uintptr_t LOCK = 0b10; - std::atomic _size = 0; + std::atomic _size = 0; std::vector> _set; + static_assert(std::atomic::is_always_lock_free, "Index type can't be used as a lock-free type"); + static_assert(std::atomic::is_always_lock_free, "uintptr_t can't be used as a lock-free type"); boost::lockfree::queue, boost::lockfree::capacity> freeQueue; - boost::lockfree::queue, boost::lockfree::capacity> - freeList; + boost::lockfree::queue, boost::lockfree::capacity> freeList; }; -ActorLineageSet::ActorLineageSet() { +template +WriteOnlySet::WriteOnlySet() : _set(CAPACITY) { // insert the free indexes in reverse order for (unsigned i = CAPACITY; i > 0; --i) { freeQueue.push(i - 1); - _set[i] = uintptr_t(1); + _set[i] = uintptr_t(FREE); } } -std::vector> ActorLineageSet::copy() { - std::vector> result; +template +std::vector> WriteOnlySet::copy() { + std::vector> result; for (int i = 0; i < CAPACITY; ++i) { auto ptr = _set[i].load(); if ((ptr & FREE) != 0) { ASSERT((ptr & LOCK) == 0); if (_set[i].compare_exchange_strong(ptr, ptr | LOCK)) { - ActorLineage* entry = reinterpret_cast(ptr); + T* entry = reinterpret_cast(ptr); ptr |= LOCK; entry->addref(); // we try to unlock now. 
If this element was removed while we incremented the refcount, the element will @@ -85,32 +91,5 @@ std::vector> ActorLineageSet::copy() { return result; } -ActorLineageSet::Index ActorLineageSet::insert(const Reference& lineage) { - Index res; - if (!freeQueue.pop(res)) { - TraceEvent(SevWarnAlways, "NoCapacityInActorLineageSet"); - return npos; - } - ASSERT(_set[res].load() & FREE); - auto ptr = reinterpret_cast(lineage.getPtr()); - ASSERT((ptr % 4) == 0); // this needs to be at least 4-byte aligned - lineage->addref(); - _set[res].store(ptr); - return res; -} - -void ActorLineageSet::erase(Index idx) { - while (true) { - auto ptr = _set[idx].load(); - if (ptr & LOCK) { - _set[idx].store(FREE); - freeList.push(reinterpret_cast(ptr ^ LOCK)); - return; - } else { - if (_set[idx].compare_exchange_strong(ptr, FREE)) { - reinterpret_cast(ptr)->delref(); - return; - } - } - } -} \ No newline at end of file +class ActorLineage; +extern template class WriteOnlySet; From 459afeed4cd9d6df4892e085f94d369af59f1efc Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Fri, 19 Mar 2021 11:25:55 -0600 Subject: [PATCH 019/461] disable jemalloc on macOS --- cmake/Jemalloc.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/Jemalloc.cmake b/cmake/Jemalloc.cmake index 6dff173b93..e89ef3ce82 100644 --- a/cmake/Jemalloc.cmake +++ b/cmake/Jemalloc.cmake @@ -3,7 +3,7 @@ add_library(jemalloc INTERFACE) set(USE_JEMALLOC ON) # We don't want to use jemalloc on Windows # Nor on FreeBSD, where jemalloc is the default system allocator -if(USE_SANITIZER OR WIN32 OR (CMAKE_SYSTEM_NAME STREQUAL "FreeBSD")) +if(USE_SANITIZER OR WIN32 OR (CMAKE_SYSTEM_NAME STREQUAL "FreeBSD") OR APPLE) set(USE_JEMALLOC OFF) return() endif() From 995ae34b1e637f6f776fc889e00474eb1ca1a322 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Fri, 19 Mar 2021 17:10:42 -0600 Subject: [PATCH 020/461] Bugfxies & hack to allow new unit test to run --- fdbserver/fdbserver.actor.cpp | 4 ++ 
flow/WriteOnlySet.actor.cpp | 89 ++++++++++++++++++++++++++++++----- flow/WriteOnlySet.h | 44 +++-------------- 3 files changed, 89 insertions(+), 48 deletions(-) diff --git a/fdbserver/fdbserver.actor.cpp b/fdbserver/fdbserver.actor.cpp index ff28269e4f..a285c0b958 100644 --- a/fdbserver/fdbserver.actor.cpp +++ b/fdbserver/fdbserver.actor.cpp @@ -66,6 +66,7 @@ #include "flow/SystemMonitor.h" #include "flow/TLSConfig.actor.h" #include "flow/Tracing.h" +#include "flow/WriteOnlySet.h" #if defined(__linux__) || defined(__FreeBSD__) #include @@ -1572,6 +1573,9 @@ private: } // namespace int main(int argc, char* argv[]) { + // TODO: Remove later, this is just to force the statics to be initialized + // otherwise the unit test won't run + ActorLineageSet _; try { platformInit(); diff --git a/flow/WriteOnlySet.actor.cpp b/flow/WriteOnlySet.actor.cpp index d0f7c514ad..32023f5e24 100644 --- a/flow/WriteOnlySet.actor.cpp +++ b/flow/WriteOnlySet.actor.cpp @@ -34,32 +34,75 @@ auto WriteOnlySet::insert(const Reference& lineage) - TraceEvent(SevWarnAlways, "NoCapacityInWriteOnlySet"); return npos; } - ASSERT(_set[res].load() & FREE); + ASSERT(_set[res].load() == 0); auto ptr = reinterpret_cast(lineage.getPtr()); - ASSERT((ptr % 4) == 0); // this needs to be at least 4-byte aligned - ASSERT((ptr & FREE) == 0 && (ptr & LOCK) == 0); + ASSERT((ptr % 2) == 0); // this needs to be at least 2-byte aligned + ASSERT(ptr != 0); lineage->addref(); _set[res].store(ptr); return res; } template -void WriteOnlySet::erase(Index idx) { +bool WriteOnlySet::eraseImpl(Index idx) { while (true) { auto ptr = _set[idx].load(); if (ptr & LOCK) { - _set[idx].store(FREE); + _set[idx].store(0); freeList.push(reinterpret_cast(ptr ^ LOCK)); - return; + return false; } else { - if (_set[idx].compare_exchange_strong(ptr, FREE)) { + if (_set[idx].compare_exchange_strong(ptr, 0)) { reinterpret_cast(ptr)->delref(); - return; + return true; } } } } +template +bool WriteOnlySet::erase(Index idx) { + auto res = 
eraseImpl(idx); + ASSERT(freeQueue.push(idx)); + return res; +} + +template +WriteOnlySet::WriteOnlySet() : _set(CAPACITY) { + // insert the free indexes in reverse order + for (unsigned i = CAPACITY; i > 0; --i) { + freeQueue.push(i - 1); + _set[i] = uintptr_t(0); + } +} + +template +std::vector> WriteOnlySet::copy() { + std::vector> result; + for (int i = 0; i < CAPACITY; ++i) { + auto ptr = _set[i].load(); + if (ptr) { + ASSERT((ptr & LOCK) == 0); // if we lock something we need to immediately unlock after we're done copying + // We attempt lock so this won't get deleted. We will try this only once, if the other thread removed the + // object from the set between the previews lines and now, we just won't make it part of the result. + if (_set[i].compare_exchange_strong(ptr, ptr | LOCK)) { + T* entry = reinterpret_cast(ptr); + ptr |= LOCK; + entry->addref(); + // we try to unlock now. If this element was removed while we incremented the refcount, the element will + // end up in the freeList, so we will decrement later. + _set[i].compare_exchange_strong(ptr, ptr ^ LOCK); + result.emplace_back(entry); + } + } + } + // after we're done we need to clean up all objects that contented on a lock. 
This won't be perfect (as some thread + // might not yet added the object to the free list), but whatever we don't get now we'll clean up in the next + // iteration + freeList.consume_all([](auto toClean) { toClean->delref(); }); + return result; +} + // Explicit instantiation template class WriteOnlySet; @@ -67,7 +110,10 @@ template class WriteOnlySet; namespace { std::atomic instanceCounter = 0; -constexpr double iteration_frequency = 10.0; +std::atomic numInserts = 0; +std::atomic numErase = 0; +std::atomic numLockedErase = 0; +std::atomic numCopied = 0; struct TestObject { mutable std::atomic _refCount = 1; @@ -117,6 +163,7 @@ void testCopier(std::shared_ptr set, std::chrono::seconds runFor) { return; } auto copy = set->copy(); + numCopied.fetch_add(copy.size()); std::this_thread::sleep_for(std::chrono::milliseconds(10)); } } @@ -126,17 +173,32 @@ void writer(std::shared_ptr set, std::chrono::seconds runFor) { std::random_device rDev; DeterministicRandom rnd(rDev()); while (true) { + unsigned inserts = 0, erases = 0; if (Clock::now() - start > runFor) { return; } std::vector positions; for (int i = 0; i < rnd.randomInt(1, 101); ++i) { - positions.push_back(set->insert(Reference(new TestObject()))); + Reference o(new TestObject()); + auto pos = set->insert(o); + if (pos == TestSet::npos) { + // could not insert -- ignore + break; + } + ++inserts; + ASSERT(pos < TestSet::capacity); + positions.push_back(pos); } rnd.randomShuffle(positions); for (auto p : positions) { - set->erase(p); + if (!set->erase(p)) { + ++numLockedErase; + } + ++erases; } + numInserts.fetch_add(inserts); + numErase.fetch_add(erases); + ASSERT(inserts == erases); std::this_thread::sleep_for(std::chrono::milliseconds(1)); } } @@ -154,6 +216,11 @@ TEST_CASE("/flow/WriteOnlySet") { } threads->emplace_back([set, runFor]() { testCopier(set, runFor); }); wait(threadjoiner(threads, set)); + TraceEvent("WriteOnlySetTestResult") + .detail("Inserts", numInserts.load()) + .detail("Erases", 
numErase.load()) + .detail("Copies", numCopied.load()) + .detail("LockedErase", numLockedErase.load()); return Void(); } } // namespace \ No newline at end of file diff --git a/flow/WriteOnlySet.h b/flow/WriteOnlySet.h index a319ad22f0..9d80795c68 100644 --- a/flow/WriteOnlySet.h +++ b/flow/WriteOnlySet.h @@ -31,6 +31,7 @@ public: using Index = IndexType; // For now we use a fixed size capacity constexpr static Index npos = std::numeric_limits::max(); + constexpr static IndexType capacity = CAPACITY; explicit WriteOnlySet(); WriteOnlySet(const WriteOnlySet&) = delete; @@ -43,12 +44,13 @@ public: unsigned size(); Index insert(const Reference& lineage); - void erase(Index idx); + bool erase(Index idx); std::vector> copy(); private: - static constexpr uintptr_t FREE = 0b1; - static constexpr uintptr_t LOCK = 0b10; + bool eraseImpl(Index idx); + + static constexpr uintptr_t LOCK = 0b1; std::atomic _size = 0; std::vector> _set; static_assert(std::atomic::is_always_lock_free, "Index type can't be used as a lock-free type"); @@ -57,39 +59,7 @@ private: boost::lockfree::queue, boost::lockfree::capacity> freeList; }; -template -WriteOnlySet::WriteOnlySet() : _set(CAPACITY) { - // insert the free indexes in reverse order - for (unsigned i = CAPACITY; i > 0; --i) { - freeQueue.push(i - 1); - _set[i] = uintptr_t(FREE); - } -} - -template -std::vector> WriteOnlySet::copy() { - std::vector> result; - for (int i = 0; i < CAPACITY; ++i) { - auto ptr = _set[i].load(); - if ((ptr & FREE) != 0) { - ASSERT((ptr & LOCK) == 0); - if (_set[i].compare_exchange_strong(ptr, ptr | LOCK)) { - T* entry = reinterpret_cast(ptr); - ptr |= LOCK; - entry->addref(); - // we try to unlock now. If this element was removed while we incremented the refcount, the element will - // end up in the freeList, so we will decrement later. - _set[i].compare_exchange_strong(ptr, ptr ^ LOCK); - result.emplace_back(entry); - } - } - } - // after we're done we need to clean up all objects that contented on a lock. 
This won't be perfect (as some thread - // might not yet added the object to the free list), but whatever we don't get now we'll clean up in the next - // iteration - freeList.consume_all([](auto toClean) { toClean->delref(); }); - return result; -} - class ActorLineage; extern template class WriteOnlySet; + +using ActorLineageSet = WriteOnlySet; From 99ac47e96c10922ca40e1267467bcfcbb51a51a0 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Fri, 19 Mar 2021 18:08:09 -0600 Subject: [PATCH 021/461] documentation --- flow/WriteOnlySet.actor.cpp | 6 ++++ flow/WriteOnlySet.h | 65 +++++++++++++++++++++++++++++++++---- 2 files changed, 64 insertions(+), 7 deletions(-) diff --git a/flow/WriteOnlySet.actor.cpp b/flow/WriteOnlySet.actor.cpp index 32023f5e24..93d9e99fc7 100644 --- a/flow/WriteOnlySet.actor.cpp +++ b/flow/WriteOnlySet.actor.cpp @@ -109,12 +109,14 @@ template class WriteOnlySet; // testing code namespace { +// Some statistics std::atomic instanceCounter = 0; std::atomic numInserts = 0; std::atomic numErase = 0; std::atomic numLockedErase = 0; std::atomic numCopied = 0; +// A simple object that counts the number of its instances. This is used to detect memory leaks. struct TestObject { mutable std::atomic _refCount = 1; TestObject() { instanceCounter.fetch_add(1); } @@ -130,6 +132,7 @@ struct TestObject { using TestSet = WriteOnlySet; using Clock = std::chrono::steady_clock; +// An actor that can join a set of threads in an async way. ACTOR Future threadjoiner(std::shared_ptr> threads, std::shared_ptr set) { loop { wait(delay(0.1)); @@ -156,6 +159,7 @@ ACTOR Future threadjoiner(std::shared_ptr> thread } } +// occasionally copy the contents of the past set. 
void testCopier(std::shared_ptr set, std::chrono::seconds runFor) { auto start = Clock::now(); while (true) { @@ -168,6 +172,7 @@ void testCopier(std::shared_ptr set, std::chrono::seconds runFor) { } } +// In a loop adds and removes a set of objects to the set void writer(std::shared_ptr set, std::chrono::seconds runFor) { auto start = Clock::now(); std::random_device rDev; @@ -203,6 +208,7 @@ void writer(std::shared_ptr set, std::chrono::seconds runFor) { } } +// This unit test creates 5 writer threads and one copier thread. TEST_CASE("/flow/WriteOnlySet") { if (g_network->isSimulated()) { // This test is not deterministic, so we shouldn't run it in simulation diff --git a/flow/WriteOnlySet.h b/flow/WriteOnlySet.h index 9d80795c68..a2589ec387 100644 --- a/flow/WriteOnlySet.h +++ b/flow/WriteOnlySet.h @@ -24,6 +24,21 @@ #include "flow/Trace.h" #include +/** + * This is a Write-Only set that supports copying the whole content. This data structure is lock-free and allows a user + * to insert and remove objects up to a given capacity (passed by a template). + * + * Template parameters: + * \param T The type to store. + * \param IndexType The type used as an index + * \param CAPACITY The maximum number of object this structure can store (if a user tries to store more, insert will + * fail gracefully) + * \pre T implements `void addref() const` and `void delref() const` + * \pre IndexType must have a copy constructor + * \pre IndexType must have a trivial assignment operator + * \pre IndexType must have a trivial destructor + * \pre IndexType can be used as an index into a std::vector + */ template class WriteOnlySet { public: @@ -37,25 +52,61 @@ public: WriteOnlySet(const WriteOnlySet&) = delete; WriteOnlySet& operator=(const WriteOnlySet&) = delete; - // Returns the number of elements at the time of calling. Keep in mind that this is a lockfree data structure, so - // the actual size might change anytime after or even during the call. 
This function only guarantees that the size - // was whatever the method returns at one point between the start and the end of the function call. The safest way - // to handle this is by assuming that this returns an estimate. - unsigned size(); + /** + * Attempts to insert \p lineage into the set. This method can fail if the set is full (its size is equal to its + * capacity). Calling insert on a full set is safe but the method will return \ref npos if the operation fails. + * + * \param lineage A reference to the object the user wants to insert. + * \ret An index that can later be used to erase the value again or \ref npos if the insert failed. + * \pre lineage.getPtr() % 2 == 0 (the memory for lineage has to be at least 2 byte aligned) + */ + [[nodiscard]] Index insert(const Reference& lineage); - Index insert(const Reference& lineage); + /** + * Erases the object associated with \p idx from the set. + * + * \ret Whether the reference count was decremented. Usually the return value is only interesting for testing and + * benchmarking purposes and will in most cases be ignored. If \ref delref wasn't called, it will be called + * later. Note that at the time the return value is checked, \ref delref might already have been called. + */ bool erase(Index idx); + /** + * Copies all elements that are stored in the set into a vector. This copy operation does NOT provide a snapshot of + * the data structure. The contract is weak: + * - All object that were in the set before copy is called and weren't removed until after copy returned are + * guaranteed to be in the result. + * - Any object that was inserted while copy is running might be in the result. + * - Any object that was erased while copy is running might be in the result. + */ std::vector> copy(); private: + // the implementation of erase -- the wrapper just makes the function a bit more readable. 
bool eraseImpl(Index idx); + // the last bit of a pointer within the set is used like a boolean and true means that the object is locked. Locking + // an object is only relevant for memory management. A locked pointer can still be erased from the set, but the + // erase won't call delref on the object. Instead it will push the pointer into the \ref freeList and copy will call + // delref later. static constexpr uintptr_t LOCK = 0b1; - std::atomic _size = 0; + + // The actual memory std::vector> _set; static_assert(std::atomic::is_always_lock_free, "Index type can't be used as a lock-free type"); static_assert(std::atomic::is_always_lock_free, "uintptr_t can't be used as a lock-free type"); + + // The freeQueue. On creation all indexes (0..capacity-1) are pushed into this queue. On insert one element from + // this queue is consumed and the resulting number is used as an index into the set. On erase the index is given + // back to the freeQueue. boost::lockfree::queue, boost::lockfree::capacity> freeQueue; + + // The freeList is used for memory management. Generally copying a shared pointer can't be done in a lock-free way. + // Instead, when we copy the data structure we first copy the address, then attempt to set the last bit to 1 and + // only if that succeeds we will increment the reference count. Whenever we attempt to remove an object + // in \ref erase we remove the object from the set (using an atomic compare and swap) and only decrement the + // reference count if the last bit is 0. If it's not we'll push the pointer into this free list. + // \ref copy will consume all elements from this freeList each time it runs and decrements the refcount for each + // element. 
boost::lockfree::queue, boost::lockfree::capacity> freeList; }; From 61352b912444c5d3601b8e33de234cc1f61fe32b Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Mon, 22 Mar 2021 11:41:45 -0600 Subject: [PATCH 022/461] use push_back where emplace_back is unnecessary --- flow/WriteOnlySet.actor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flow/WriteOnlySet.actor.cpp b/flow/WriteOnlySet.actor.cpp index 93d9e99fc7..9ab63aa56f 100644 --- a/flow/WriteOnlySet.actor.cpp +++ b/flow/WriteOnlySet.actor.cpp @@ -92,7 +92,7 @@ std::vector> WriteOnlySet::copy() { // we try to unlock now. If this element was removed while we incremented the refcount, the element will // end up in the freeList, so we will decrement later. _set[i].compare_exchange_strong(ptr, ptr ^ LOCK); - result.emplace_back(entry); + result.push_back(entry); } } } From 301daf326939d6378d410420d007322f7c7a3dd3 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Mon, 22 Mar 2021 11:46:16 -0600 Subject: [PATCH 023/461] address review comments --- flow/WriteOnlySet.actor.cpp | 2 +- flow/WriteOnlySet.h | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/flow/WriteOnlySet.actor.cpp b/flow/WriteOnlySet.actor.cpp index 9ab63aa56f..364c53460d 100644 --- a/flow/WriteOnlySet.actor.cpp +++ b/flow/WriteOnlySet.actor.cpp @@ -1,5 +1,5 @@ /* - * WriteOnlySet.cpp + * WriteOnlySet.actor.cpp * * This source file is part of the FoundationDB open source project * diff --git a/flow/WriteOnlySet.h b/flow/WriteOnlySet.h index a2589ec387..c71736f852 100644 --- a/flow/WriteOnlySet.h +++ b/flow/WriteOnlySet.h @@ -1,5 +1,5 @@ /* - * WriteOnlySet.cpp + * WriteOnlySet.h * * This source file is part of the FoundationDB open source project * @@ -50,7 +50,9 @@ public: explicit WriteOnlySet(); WriteOnlySet(const WriteOnlySet&) = delete; + WriteOnlySet(WriteOnlySet&&) = delete; WriteOnlySet& operator=(const WriteOnlySet&) = delete; + WriteOnlySet& operator=(WriteOnlySet&&) = delete; /** * Attempts to 
insert \p lineage into the set. This method can fail if the set is full (its size is equal to its @@ -93,7 +95,7 @@ private: // The actual memory std::vector> _set; static_assert(std::atomic::is_always_lock_free, "Index type can't be used as a lock-free type"); - static_assert(std::atomic::is_always_lock_free, "uintptr_t can't be used as a lock-free type"); + static_assert(std::atomic::is_always_lock_free, "uintptr_t can't be used as a lock-free type"); // The freeQueue. On creation all indexes (0..capacity-1) are pushed into this queue. On insert one element from // this queue is consumed and the resulting number is used as an index into the set. On erase the index is given From 5bd79de88179945a78e7862d90e7de183d3d690c Mon Sep 17 00:00:00 2001 From: Lukas Joswiak Date: Mon, 22 Mar 2021 10:01:28 -0700 Subject: [PATCH 024/461] Fix build --- flow/Profiler.actor.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/flow/Profiler.actor.cpp b/flow/Profiler.actor.cpp index 46b0bcecb4..24bba87739 100644 --- a/flow/Profiler.actor.cpp +++ b/flow/Profiler.actor.cpp @@ -142,6 +142,8 @@ struct Profiler { } void signal_handler() { // async signal safe! + static std::atomic inSigHandler = false; + if (inSigHandler.exchange(true)) { return; } if (profilingEnabled) { double t = timer(); output_buffer->push(*(void**)&t); From 0ec7340a6f72f8d29b43ade50667d2b0e88ebd75 Mon Sep 17 00:00:00 2001 From: Lukas Joswiak Date: Mon, 22 Mar 2021 10:55:52 -0700 Subject: [PATCH 025/461] Create reference --- flow/WriteOnlySet.actor.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/flow/WriteOnlySet.actor.cpp b/flow/WriteOnlySet.actor.cpp index 364c53460d..92eceea7bc 100644 --- a/flow/WriteOnlySet.actor.cpp +++ b/flow/WriteOnlySet.actor.cpp @@ -92,7 +92,7 @@ std::vector> WriteOnlySet::copy() { // we try to unlock now. If this element was removed while we incremented the refcount, the element will // end up in the freeList, so we will decrement later. 
_set[i].compare_exchange_strong(ptr, ptr ^ LOCK); - result.push_back(entry); + result.push_back(Reference(entry)); } } } @@ -229,4 +229,4 @@ TEST_CASE("/flow/WriteOnlySet") { .detail("LockedErase", numLockedErase.load()); return Void(); } -} // namespace \ No newline at end of file +} // namespace From 35f9fe08a277ba3c1e0d74dc6795cb7ca7811194 Mon Sep 17 00:00:00 2001 From: Chaoguang Lin Date: Tue, 23 Mar 2021 14:44:14 -0700 Subject: [PATCH 026/461] Remove unnecessary header in IClientApi.h --- fdbclient/IClientApi.h | 1 - 1 file changed, 1 deletion(-) diff --git a/fdbclient/IClientApi.h b/fdbclient/IClientApi.h index 25f5098b0f..0791f795a4 100644 --- a/fdbclient/IClientApi.h +++ b/fdbclient/IClientApi.h @@ -20,7 +20,6 @@ #ifndef FDBCLIENT_ICLIENTAPI_H #define FDBCLIENT_ICLIENTAPI_H -#include "fdbclient/ManagementAPI.actor.h" #pragma once #include "fdbclient/FDBOptions.g.h" From cb39d1a6ed1ee89f3e02369e56068465818e42b8 Mon Sep 17 00:00:00 2001 From: Chaoguang Lin Date: Wed, 24 Mar 2021 09:33:20 -0700 Subject: [PATCH 027/461] Refactor consistencycheck command using special keys --- fdbcli/CMakeLists.txt | 3 + fdbcli/ConsistencycheckCommand.actor.cpp | 45 +++++++++++++ fdbcli/Util.cpp | 12 ++++ fdbcli/fdbcli.actor.cpp | 82 +++++++++++------------- fdbcli/fdbcli.h | 60 +++++++++++++++++ 5 files changed, 156 insertions(+), 46 deletions(-) create mode 100644 fdbcli/ConsistencycheckCommand.actor.cpp create mode 100644 fdbcli/Util.cpp create mode 100644 fdbcli/fdbcli.h diff --git a/fdbcli/CMakeLists.txt b/fdbcli/CMakeLists.txt index 2b65baf040..b1eb09d491 100644 --- a/fdbcli/CMakeLists.txt +++ b/fdbcli/CMakeLists.txt @@ -1,7 +1,10 @@ set(FDBCLI_SRCS + fdbcli.h fdbcli.actor.cpp + ConsistencycheckCommand.actor.cpp FlowLineNoise.actor.cpp FlowLineNoise.h + Util.cpp linenoise/linenoise.h) if(NOT WIN32) diff --git a/fdbcli/ConsistencycheckCommand.actor.cpp b/fdbcli/ConsistencycheckCommand.actor.cpp new file mode 100644 index 0000000000..349be547f5 --- /dev/null +++ 
b/fdbcli/ConsistencycheckCommand.actor.cpp @@ -0,0 +1,45 @@ +#include "fdbcli/fdbcli.h" + +#include "fdbclient/FDBOptions.g.h" +#include "fdbclient/IClientApi.h" + +#include "flow/Arena.h" +#include "flow/FastRef.h" +#include "flow/ThreadHelper.actor.h" +#include "flow/actorcompiler.h" + +using namespace FDBCLI; + +ACTOR static Future consistencycheckCommandActor(Reference db, std::vector tokens) { + state Reference tr = db->createTransaction(); + tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); + KeyRef k = LiteralStringRef("\xff\xff/management/consistency_check_suspended"); + if (tokens.size() == 1) { + Optional suspended = wait(unsafeThreadFutureToFuture(tr->get(k))); + printf("ConsistencyCheck is %s\n", suspended.present() ? "off" : "on"); + } else if (tokens.size() == 2 && tokencmp(tokens[1], "off")) { + tr->set(k, Value()); + wait(unsafeThreadFutureToFuture(tr->commit())); + } else if (tokens.size() == 2 && tokencmp(tokens[1], "on")) { + tr->clear(k); + wait(unsafeThreadFutureToFuture(tr->commit())); + } else { + printUsage(tokens[0]); + return false; + } + return true; +} + +namespace FDBCLI { + +Future consistencycheckCommand(Reference db, std::vector tokens) { + return consistencycheckCommandActor(db, tokens); +} + +CommandFactory consistencycheckFactory("consistencycheck", CommandHelp( + "consistencycheck [on|off]", + "permits or prevents consistency checking", + "Calling this command with `on' permits consistency check processes to run and `off' will halt their checking. 
" + "Calling this command with no arguments will display if consistency checking is currently allowed.\n")); + +} // namespace FDBCLI \ No newline at end of file diff --git a/fdbcli/Util.cpp b/fdbcli/Util.cpp new file mode 100644 index 0000000000..20d9da2f2c --- /dev/null +++ b/fdbcli/Util.cpp @@ -0,0 +1,12 @@ +#include "flow/Arena.h" + +namespace FDBCLI { + +bool tokencmp(StringRef token, const char* command) { + if (token.size() != strlen(command)) + return false; + + return !memcmp(token.begin(), command, token.size()); +} + +} \ No newline at end of file diff --git a/fdbcli/fdbcli.actor.cpp b/fdbcli/fdbcli.actor.cpp index e608e96086..d88e98455f 100644 --- a/fdbcli/fdbcli.actor.cpp +++ b/fdbcli/fdbcli.actor.cpp @@ -21,6 +21,8 @@ #include "boost/lexical_cast.hpp" #include "fdbclient/NativeAPI.actor.h" #include "fdbclient/FDBTypes.h" +#include "fdbclient/IClientApi.h" +#include "fdbclient/MultiVersionTransaction.h" #include "fdbclient/Status.h" #include "fdbclient/StatusClient.h" #include "fdbclient/DatabaseContext.h" @@ -34,12 +36,14 @@ #include "fdbclient/TagThrottle.h" #include "flow/DeterministicRandom.h" +#include "flow/FastRef.h" #include "flow/Platform.h" #include "flow/TLSConfig.actor.h" #include "flow/SimpleOpt.h" #include "fdbcli/FlowLineNoise.h" +#include "fdbcli/fdbcli.h" #include #include @@ -55,6 +59,12 @@ #include "flow/actorcompiler.h" // This must be the last #include. +/* + * While we could just use the MultiVersionApi instance directly, this #define allows us to swap in any other IClientApi + * instance (e.g. 
from ThreadSafeApi) + */ +#define API ((IClientApi*)MultiVersionApi::api) + extern const char* getSourceVersion(); std::vector validOptions; @@ -319,12 +329,12 @@ static std::string formatStringRef(StringRef item, bool fullEscaping = false) { return ret; } -static bool tokencmp(StringRef token, const char* command) { - if (token.size() != strlen(command)) - return false; +// static bool tokencmp(StringRef token, const char* command) { +// if (token.size() != strlen(command)) +// return false; - return !memcmp(token.begin(), command, token.size()); -} +// return !memcmp(token.begin(), command, token.size()); +// } static std::vector> parseLine(std::string& line, bool& err, bool& partial) { err = false; @@ -452,20 +462,13 @@ static void printProgramUsage(const char* name) { " -h, --help Display this help and exit.\n"); } -struct CommandHelp { - std::string usage; - std::string short_desc; - std::string long_desc; - CommandHelp() {} - CommandHelp(const char* u, const char* s, const char* l) : usage(u), short_desc(s), long_desc(l) {} -}; - -std::map helpMap; -std::set hiddenCommands; - #define ESCAPINGK "\n\nFor information on escaping keys, type `help escaping'." #define ESCAPINGKV "\n\nFor information on escaping keys and values, type `help escaping'." +using namespace FDBCLI; +std::map& helpMap = FDBCLI::CommandFactory::commands(); +std::set& hiddenCommands = FDBCLI::CommandFactory::hiddenCommands(); + void initHelp() { helpMap["begin"] = CommandHelp("begin", @@ -649,11 +652,6 @@ void initHelp() { "SECONDS have elapsed, or after a storage server with a different ZONEID fails. Only one ZONEID can be marked " "for maintenance. Calling this command with no arguments will display any ongoing maintenance. 
Calling this " "command with `off' will disable maintenance.\n"); - helpMap["consistencycheck"] = CommandHelp( - "consistencycheck [on|off]", - "permits or prevents consistency checking", - "Calling this command with `on' permits consistency check processes to run and `off' will halt their checking. " - "Calling this command with no arguments will display if consistency checking is currently allowed.\n"); helpMap["throttle"] = CommandHelp("throttle [ARGS]", "view and control throttled tags", @@ -719,7 +717,7 @@ void printHelp(StringRef command) { printf("I don't know anything about `%s'\n", formatStringRef(command).c_str()); } -void printUsage(StringRef command) { +void FDBCLI::printUsage(StringRef command) { auto i = helpMap.find(command.toString()); if (i != helpMap.end()) printf("Usage: %s\n", i->second.usage.c_str()); @@ -3140,6 +3138,8 @@ ACTOR Future cli(CLIOptions opt, LineNoise* plinenoise) { state Database db; state Reference tr; + // refactoring + state Reference db2; state bool writeMode = false; @@ -3177,6 +3177,14 @@ ACTOR Future cli(CLIOptions opt, LineNoise* plinenoise) { return 1; } + try { + db2 = API->createDatabase(opt.clusterFile.c_str()); + } catch (Error& e) { + fprintf(stderr, "(CAPI)ERROR: %s (%d)\n", e.what(), e.code()); + printf("(CAPI): Unable to connect to cluster from `%s'\n", ccf->getFilename().c_str()); + return 1; + } + if (opt.trace) { TraceEvent("CLIProgramStart") .setMaxEventLength(12000) @@ -3795,29 +3803,8 @@ ACTOR Future cli(CLIOptions opt, LineNoise* plinenoise) { } if (tokencmp(tokens[0], "consistencycheck")) { - getTransaction(db, tr, options, intrans); - tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); - tr->setOption(FDBTransactionOptions::LOCK_AWARE); - if (tokens.size() == 1) { - state Future>> ccSuspendSettingFuture = - tr->get(fdbShouldConsistencyCheckBeSuspended); - wait(makeInterruptable(success(ccSuspendSettingFuture))); - bool 
ccSuspendSetting = - ccSuspendSettingFuture.get().present() - ? BinaryReader::fromStringRef(ccSuspendSettingFuture.get().get(), Unversioned()) - : false; - printf("ConsistencyCheck is %s\n", ccSuspendSetting ? "off" : "on"); - } else if (tokens.size() == 2 && tokencmp(tokens[1], "off")) { - tr->set(fdbShouldConsistencyCheckBeSuspended, BinaryWriter::toValue(true, Unversioned())); - wait(commitTransaction(tr)); - } else if (tokens.size() == 2 && tokencmp(tokens[1], "on")) { - tr->set(fdbShouldConsistencyCheckBeSuspended, BinaryWriter::toValue(false, Unversioned())); - wait(commitTransaction(tr)); - } else { - printUsage(tokens[0]); - is_error = true; - } + bool _result = wait(consistencycheckCommand(db2, tokens)); + is_error = _result; continue; } @@ -4909,7 +4896,10 @@ int main(int argc, char** argv) { } try { - setupNetwork(); + // setupNetwork(); + // refactoring fdbcli + API->selectApiVersion(700); + API->setupNetwork(); Future cliFuture = runCli(opt); Future timeoutFuture = opt.exit_timeout ? timeExit(opt.exit_timeout) : Never(); auto f = stopNetworkAfter(success(cliFuture) || timeoutFuture); diff --git a/fdbcli/fdbcli.h b/fdbcli/fdbcli.h new file mode 100644 index 0000000000..d93616e657 --- /dev/null +++ b/fdbcli/fdbcli.h @@ -0,0 +1,60 @@ +/* + * fdbcli.actor.h + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2021 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef FDBCLI_H +#define FDBCLI_H +#pragma once + +#include "fdbclient/IClientApi.h" +#include "flow/Arena.h" + +namespace FDBCLI { + +struct CommandHelp { + std::string usage; + std::string short_desc; + std::string long_desc; + CommandHelp() {} + CommandHelp(const char* u, const char* s, const char* l) : usage(u), short_desc(s), long_desc(l) {} +}; + +struct CommandFactory { + CommandFactory(const char* name, CommandHelp help) { commands()[name] = help; } + CommandFactory(const char* name) { hiddenCommands().insert(name); } + static std::map& commands() { + static std::map helpMap; + return helpMap; + } + static std::set& hiddenCommands() { + static std::set commands; + return commands; + } +}; + +// help functions +bool tokencmp(StringRef token, const char* command); +void printUsage(StringRef command); + +// consistency command +Future consistencycheckCommand(Reference db, std::vector tokens); + +} // namespace FDBCLI + +#endif \ No newline at end of file From b246e673bceab43b28cc4a855584333eb3404146 Mon Sep 17 00:00:00 2001 From: Jon Fu Date: Wed, 24 Mar 2021 15:34:19 -0400 Subject: [PATCH 028/461] Added comment to seedShardServers (taken from existing desc in .h file) --- fdbserver/MoveKeys.actor.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fdbserver/MoveKeys.actor.cpp b/fdbserver/MoveKeys.actor.cpp index 83f7170e95..0702b8d097 100644 --- a/fdbserver/MoveKeys.actor.cpp +++ b/fdbserver/MoveKeys.actor.cpp @@ -1212,6 +1212,8 @@ ACTOR Future moveKeys(Database cx, return Void(); } +// Called by the master server to write the very first transaction to the database +// establishing a set of shard servers and all invariants of the systemKeys. 
void seedShardServers(Arena& arena, CommitTransactionRef& tr, vector servers) { std::map, Tag> dcId_locality; std::map server_tag; From 2dfd420882537d7fa7d477c08b699f1a5e961a1c Mon Sep 17 00:00:00 2001 From: Lukas Joswiak Date: Wed, 24 Mar 2021 14:52:42 -0700 Subject: [PATCH 029/461] Add sampling profiler thread --- fdbrpc/AsyncFileKAIO.actor.h | 6 +++++- fdbrpc/IAsyncFile.h | 4 ++++ fdbrpc/Net2FileSystem.cpp | 4 ++++ fdbrpc/Net2FileSystem.h | 3 +++ fdbrpc/sim2.actor.cpp | 4 ++++ fdbrpc/simulator.h | 4 ++++ fdbserver/fdbserver.actor.cpp | 1 + flow/Platform.actor.cpp | 27 +++++++++++++++++++++++++++ flow/Platform.h | 2 ++ 9 files changed, 54 insertions(+), 1 deletion(-) diff --git a/fdbrpc/AsyncFileKAIO.actor.h b/fdbrpc/AsyncFileKAIO.actor.h index 5e6592e6ba..dbdb040d00 100644 --- a/fdbrpc/AsyncFileKAIO.actor.h +++ b/fdbrpc/AsyncFileKAIO.actor.h @@ -242,7 +242,11 @@ public: // result = map(result, [=](int r) mutable { KAIOLogBlockEvent(io, OpLogEntry::READY, r); return r; }); #endif - return success(result); + auto& actorLineageSet = IAsyncFileSystem::filesystem()->getActorLineageSet(); + auto index = actorLineageSet.insert(currentLineage); + Future res = success(result); + actorLineageSet.erase(index); + return res; } // TODO(alexmiller): Remove when we upgrade the dev docker image to >14.10 #ifndef FALLOC_FL_ZERO_RANGE diff --git a/fdbrpc/IAsyncFile.h b/fdbrpc/IAsyncFile.h index ed703514c6..ad48db5f07 100644 --- a/fdbrpc/IAsyncFile.h +++ b/fdbrpc/IAsyncFile.h @@ -25,6 +25,7 @@ #include #include "flow/flow.h" +#include "flow/WriteOnlySet.h" #include "fdbrpc/IRateControl.h" // All outstanding operations must be cancelled before the destructor of IAsyncFile is called. @@ -118,6 +119,9 @@ public: // Returns the time of the last modification of the file. virtual Future lastWriteTime(const std::string& filename) = 0; + // Returns the shared memory data structure used to store actor lineages. 
+ virtual ActorLineageSet& getActorLineageSet() = 0; + static IAsyncFileSystem* filesystem() { return filesystem(g_network); } static runCycleFuncPtr runCycleFunc() { return reinterpret_cast( diff --git a/fdbrpc/Net2FileSystem.cpp b/fdbrpc/Net2FileSystem.cpp index 71a7d784a1..8e895c08dc 100644 --- a/fdbrpc/Net2FileSystem.cpp +++ b/fdbrpc/Net2FileSystem.cpp @@ -89,6 +89,10 @@ Future Net2FileSystem::lastWriteTime(const std::string& filename) { return Net2AsyncFile::lastWriteTime(filename); } +ActorLineageSet& Net2FileSystem::getActorLineageSet() { + return actorLineageSet; +} + void Net2FileSystem::newFileSystem(double ioTimeout, const std::string& fileSystemPath) { g_network->setGlobal(INetwork::enFileSystem, (flowGlobalType) new Net2FileSystem(ioTimeout, fileSystemPath)); } diff --git a/fdbrpc/Net2FileSystem.h b/fdbrpc/Net2FileSystem.h index 702b87828f..0c2229b5ca 100644 --- a/fdbrpc/Net2FileSystem.h +++ b/fdbrpc/Net2FileSystem.h @@ -39,6 +39,8 @@ public: Future renameFile(std::string const& from, std::string const& to) override; + ActorLineageSet& getActorLineageSet() override; + // void init(); static void stop(); @@ -52,6 +54,7 @@ public: dev_t fileSystemDeviceId; bool checkFileSystem; #endif + ActorLineageSet actorLineageSet; }; #endif diff --git a/fdbrpc/sim2.actor.cpp b/fdbrpc/sim2.actor.cpp index 6101ca8512..e9219f3ff3 100644 --- a/fdbrpc/sim2.actor.cpp +++ b/fdbrpc/sim2.actor.cpp @@ -2494,6 +2494,10 @@ Future Sim2FileSystem::lastWriteTime(const std::string& filename) { return fileWrites[filename]; } +ActorLineageSet& Sim2FileSystem::getActorLineageSet() { + return actorLineageSet; +} + void Sim2FileSystem::newFileSystem() { g_network->setGlobal(INetwork::enFileSystem, (flowGlobalType) new Sim2FileSystem()); } diff --git a/fdbrpc/simulator.h b/fdbrpc/simulator.h index cde0eb0dda..08b4264e81 100644 --- a/fdbrpc/simulator.h +++ b/fdbrpc/simulator.h @@ -471,6 +471,8 @@ public: Future lastWriteTime(const std::string& filename) override; + ActorLineageSet& 
getActorLineageSet() override; + Future renameFile(std::string const& from, std::string const& to) override; Sim2FileSystem() {} @@ -478,6 +480,8 @@ public: ~Sim2FileSystem() override {} static void newFileSystem(); + + ActorLineageSet actorLineageSet; }; #endif diff --git a/fdbserver/fdbserver.actor.cpp b/fdbserver/fdbserver.actor.cpp index a285c0b958..fbcd7fd9ee 100644 --- a/fdbserver/fdbserver.actor.cpp +++ b/fdbserver/fdbserver.actor.cpp @@ -1948,6 +1948,7 @@ int main(int argc, char* argv[]) { ASSERT(opts.connectionFile); setupRunLoopProfiler(); + setupSamplingProfiler(); auto dataFolder = opts.dataFolder; if (!dataFolder.size()) diff --git a/flow/Platform.actor.cpp b/flow/Platform.actor.cpp index 42d8decccc..756fb6a7e3 100644 --- a/flow/Platform.actor.cpp +++ b/flow/Platform.actor.cpp @@ -48,6 +48,8 @@ #include "flow/UnitTest.h" #include "flow/FaultInjection.h" +#include "fdbrpc/IAsyncFile.h" + #ifdef _WIN32 #include #include @@ -3673,6 +3675,31 @@ void setupRunLoopProfiler() { #endif } +void* sampleThread(void* arg) { + while (true) { + threadSleep(1.0); // TODO: Read sample rate from global config + + // TODO: Copy actor lineage of currently running actor + + auto diskAlps = IAsyncFileSystem::filesystem()->getActorLineageSet().copy(); + printf("Disk ALPs: %d\n", diskAlps.size()); + + // TODO: Call collect on all actor lineages + for (auto actorLineage : diskAlps) { + } + + // TODO: Serialize collected actor linage properties + } + + return nullptr; +} + +void setupSamplingProfiler() { + // TODO: Add knob + TraceEvent("StartingSamplingProfilerThread"); + startThread(&sampleThread, nullptr); +} + // UnitTest for getMemoryInfo #ifdef __linux__ TEST_CASE("/flow/Platform/getMemoryInfo") { diff --git a/flow/Platform.h b/flow/Platform.h index 74c9395c53..edf9ff3997 100644 --- a/flow/Platform.h +++ b/flow/Platform.h @@ -741,6 +741,8 @@ void registerCrashHandler(); void setupRunLoopProfiler(); EXTERNC void setProfilingEnabled(int enabled); +void 
setupSamplingProfiler(); + // Use _exit() or criticalError(), not exit() #define exit static_assert(false, "Calls to exit() are forbidden by policy"); From 36f4c17ef143cd3c82b7038f001d256867e2a7fa Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Wed, 24 Mar 2021 15:04:45 -0700 Subject: [PATCH 030/461] Reduce the number of actor calls in load balancing to improve performance. --- fdbrpc/LoadBalance.actor.h | 321 +++++++++++++++++++++---------------- 1 file changed, 184 insertions(+), 137 deletions(-) diff --git a/fdbrpc/LoadBalance.actor.h b/fdbrpc/LoadBalance.actor.h index 9b47912993..78f73352ba 100644 --- a/fdbrpc/LoadBalance.actor.h +++ b/fdbrpc/LoadBalance.actor.h @@ -75,109 +75,169 @@ struct LoadBalancedReply { Optional getLoadBalancedReply(const LoadBalancedReply* reply); Optional getLoadBalancedReply(const void*); -// Returns true if we got a value for our request -// Throws an error if the request returned an error that should bubble out -// Returns false if we got an error that should result in reissuing the request -template -bool checkAndProcessResult(ErrorOr result, Reference holder, bool atMostOnce, bool triedAllOptions) { - Optional loadBalancedReply; - if (!result.isError()) { - loadBalancedReply = getLoadBalancedReply(&result.get()); +// Stores state for a request made by the load balancer +template +struct RequestData : NonCopyable { + Future> response; + Reference modelHolder; + Future backoffDelay; + RequestStream const* stream = nullptr; + bool triedAllOptions = false; + + bool requestStarted = false; // true once the request has been sent to an alternative + bool requestProcessed = false; // true once a response has been received and handled by checkAndProcessResult + + // Whether or not the response future is valid + // This is true once setupRequest is called, even though at that point the response is Never(). 
+ bool isValid() { return response.isValid(); } + + // Initializes the request state and starts the backoff delay + void setupRequest(double backoff, bool triedAllOptions, RequestStream const* stream) { + backoffDelay = (backoff > 0) ? delay(backoff) : Void(); + response = Never(); + modelHolder = Reference(); + requestStarted = false; + requestProcessed = false; + + this->stream = stream; + this->triedAllOptions = triedAllOptions; } - int errCode; - if (loadBalancedReply.present()) { - errCode = - loadBalancedReply.get().error.present() ? loadBalancedReply.get().error.get().code() : error_code_success; - } else { - errCode = result.isError() ? result.getError().code() : error_code_success; + // Sends the request to the configured stream + // This should not be called until after setupRequest has been called and the backoff delay has elapsed + void startRequest(Request request, QueueModel* model) { + ASSERT(stream); + ASSERT(backoffDelay.isReady()); + + backoffDelay = Never(); + modelHolder = Reference(new ModelHolder(model, stream->getEndpoint().token.first())); + response = stream->tryGetReply(request); + requestStarted = true; } - bool maybeDelivered = errCode == error_code_broken_promise || errCode == error_code_request_maybe_delivered; - bool receivedResponse = loadBalancedReply.present() ? !loadBalancedReply.get().error.present() : result.present(); - receivedResponse = receivedResponse || (!maybeDelivered && errCode != error_code_process_behind); - bool futureVersion = errCode == error_code_future_version || errCode == error_code_process_behind; + // Implementation of the logic to handle a response. 
+ // Checks the state of the response, updates the queue model, and returns one of the following outcomes: + // A return value of true means that the request completed successfully + // A return value of false means that the request failed but should be retried + // A return value with an error means that the error should be thrown back to original caller + static ErrorOr checkAndProcessResultImpl(ErrorOr result, + Reference modelHolder, + bool atMostOnce, + bool triedAllOptions) { + ASSERT(modelHolder); - holder->release( - receivedResponse, futureVersion, loadBalancedReply.present() ? loadBalancedReply.get().penalty : -1.0); + Optional loadBalancedReply; + if (!result.isError()) { + loadBalancedReply = getLoadBalancedReply(&result.get()); + } + + int errCode; + if (loadBalancedReply.present()) { + errCode = loadBalancedReply.get().error.present() ? loadBalancedReply.get().error.get().code() + : error_code_success; + } else { + errCode = result.isError() ? result.getError().code() : error_code_success; + } + + bool maybeDelivered = errCode == error_code_broken_promise || errCode == error_code_request_maybe_delivered; + bool receivedResponse = + loadBalancedReply.present() ? !loadBalancedReply.get().error.present() : result.present(); + receivedResponse = receivedResponse || (!maybeDelivered && errCode != error_code_process_behind); + bool futureVersion = errCode == error_code_future_version || errCode == error_code_process_behind; + + modelHolder->release( + receivedResponse, futureVersion, loadBalancedReply.present() ? loadBalancedReply.get().penalty : -1.0); + + if (errCode == error_code_server_overloaded) { + return false; + } + + if (loadBalancedReply.present() && !loadBalancedReply.get().error.present()) { + return true; + } + + if (!loadBalancedReply.present() && result.present()) { + return true; + } + + if (receivedResponse) { + return loadBalancedReply.present() ? 
loadBalancedReply.get().error.get() : result.getError(); + } + + if (atMostOnce && maybeDelivered) { + return request_maybe_delivered(); + } + + if (triedAllOptions && errCode == error_code_process_behind) { + return process_behind(); + } - if (errCode == error_code_server_overloaded) { return false; } - if (loadBalancedReply.present() && !loadBalancedReply.get().error.present()) { - return true; + // Checks the state of the response, updates the queue model, and returns one of the following outcomes: + // A return value of true means that the request completed successfully + // A return value of false means that the request failed but should be retried + // In the event of a non-retryable failure, an error is thrown indicating the failure + bool checkAndProcessResult(bool atMostOnce) { + ASSERT(response.isReady()); + requestProcessed = true; + + ErrorOr outcome = + checkAndProcessResultImpl(response.get(), std::move(modelHolder), atMostOnce, triedAllOptions); + + if (outcome.isError()) { + throw outcome.getError(); + } else if (!outcome.get()) { + response = Future>(); + } + + return outcome.get(); } - if (!loadBalancedReply.present() && result.present()) { - return true; + // Convert this request to a lagging request. Such a request is no longer being waited on, but it still needs to be + // processed so we can update the queue model. 
+ void makeLaggingRequest() { + ASSERT(response.isValid()); + ASSERT(!response.isReady()); + ASSERT(modelHolder); + ASSERT(modelHolder->model); + + QueueModel* model = modelHolder->model; + if (model->laggingRequestCount > FLOW_KNOBS->MAX_LAGGING_REQUESTS_OUTSTANDING || + model->laggingRequests.isReady()) { + model->laggingRequests.cancel(); + model->laggingRequestCount = 0; + model->addActor = PromiseStream>(); + model->laggingRequests = actorCollection(model->addActor.getFuture(), &model->laggingRequestCount); + } + + // We need to process the lagging request in order to update the queue model + Reference holderCapture = std::move(modelHolder); + bool triedAllOptionsCapture = triedAllOptions; + Future updateModel = + map(response, [holderCapture, triedAllOptionsCapture](ErrorOr result) { + checkAndProcessResultImpl(result, holderCapture, false, triedAllOptionsCapture); + return Void(); + }); + model->addActor.send(updateModel); } - if (receivedResponse) { - throw loadBalancedReply.present() ? 
loadBalancedReply.get().error.get() : result.getError(); - } - - if (atMostOnce && maybeDelivered) { - throw request_maybe_delivered(); - } - - if (triedAllOptions && errCode == error_code_process_behind) { - throw process_behind(); - } - - return false; -} - -ACTOR template -Future> makeRequest(RequestStream const* stream, - Request request, - double backoff, - Future requestUnneeded, - QueueModel* model, - bool isFirstRequest, - bool atMostOnce, - bool triedAllOptions) { - if (backoff > 0.0) { - wait(delay(backoff) || requestUnneeded); - } - - if (requestUnneeded.isReady()) { - return Optional(); - } - - state Reference holder(new ModelHolder(model, stream->getEndpoint().token.first())); - - ErrorOr result = wait(stream->tryGetReply(request)); - if (checkAndProcessResult(result, holder, atMostOnce, triedAllOptions)) { - return result.get(); - } else { - return Optional(); - } -} - -template -void addLaggingRequest(Future> reply, Promise requestFinished, QueueModel* model) { - requestFinished.send(Void()); - if (!reply.isReady()) { - if (model) { - if (model->laggingRequestCount > FLOW_KNOBS->MAX_LAGGING_REQUESTS_OUTSTANDING || - model->laggingRequests.isReady()) { - model->laggingRequests.cancel(); - model->laggingRequestCount = 0; - model->addActor = PromiseStream>(); - model->laggingRequests = actorCollection(model->addActor.getFuture(), &model->laggingRequestCount); - } - - model->addActor.send(success(errorOr(reply))); + ~RequestData() { + // If the request has been started but hasn't completed, mark it as a lagging request + if (requestStarted && !requestProcessed && modelHolder && modelHolder->model) { + makeLaggingRequest(); } } -} +}; -// Keep trying to get a reply from any of servers until success or cancellation; tries to take into account -// failMon's information for load balancing and avoiding failed servers +// Try to get a reply from one of the alternatives until success, cancellation, or certain errors. 
+// Load balancing has a budget to race requests to a second alternative if the first request is slow. +// Tries to take into account failMon's information for load balancing and avoiding failed servers. // If ALL the servers are failed and the list of servers is not fresh, throws an exception to let the caller refresh the -// list of servers. When model is set, load balance among alternatives in the same DC, aiming to balance request queue -// length on these interfaces. If too many interfaces in the same DC are bad, try remote interfaces. +// list of servers. +// When model is set, load balance among alternatives in the same DC aims to balance request queue length on these +// interfaces. If too many interfaces in the same DC are bad, try remote interfaces. ACTOR template Future loadBalance( Reference> alternatives, @@ -186,9 +246,11 @@ Future loadBalance( TaskPriority taskID = TaskPriority::DefaultPromiseEndpoint, bool atMostOnce = false, // if true, throws request_maybe_delivered() instead of retrying automatically QueueModel* model = nullptr) { - state Future> firstRequest; + + state RequestData firstRequestData; + state RequestData secondRequestData; + state Optional firstRequestEndpoint; - state Future> secondRequest; state Future secondDelay = Never(); state Promise requestFinished; @@ -320,7 +382,7 @@ Future loadBalance( } // Find an alternative, if any, that is not failed, starting with - // nextAlt. This logic matters only if model == NULL. Otherwise, the + // nextAlt. This logic matters only if model == nullptr. Otherwise, the // bestAlt and nextAlt have been decided. state RequestStream const* stream = nullptr; for (int alternativeNum = 0; alternativeNum < alternatives->size(); alternativeNum++) { @@ -340,7 +402,7 @@ Future loadBalance( stream = nullptr; } - if (!stream && !firstRequest.isValid()) { + if (!stream && !firstRequestData.isValid()) { // Everything is down! Wait for someone to be up. 
vector> ok(alternatives->size()); @@ -391,50 +453,40 @@ Future loadBalance( numAttempts = 0; // now that we've got a server back, reset the backoff } else if (!stream) { // Only the first location is available. - Optional result = wait(firstRequest); - if (result.present()) { - return result.get(); - } + loop choose { + when(wait(firstRequestData.backoffDelay)) { firstRequestData.startRequest(request, model); } + when(ErrorOr result = wait(firstRequestData.response)) { + if (firstRequestData.checkAndProcessResult(atMostOnce)) { + return result.get(); + } - firstRequest = Future>(); - firstRequestEndpoint = Optional(); - } else if (firstRequest.isValid()) { + firstRequestEndpoint = Optional(); + break; + } + } + } else if (firstRequestData.isValid()) { // Issue a second request, the first one is taking a long time. - secondRequest = makeRequest( - stream, request, backoff, requestFinished.getFuture(), model, false, atMostOnce, triedAllOptions); + secondRequestData.setupRequest(backoff, triedAllOptions, stream); state bool firstFinished = false; - loop { - choose { - when(ErrorOr> result = - wait(firstRequest.isValid() ? errorOr(firstRequest) : Never())) { - if (result.isError() || result.get().present()) { - addLaggingRequest(secondRequest, requestFinished, model); - if (result.isError()) { - throw result.getError(); - } else { - return result.get().get(); - } - } - - firstRequest = Future>(); - firstRequestEndpoint = Optional(); - firstFinished = true; + loop choose { + when(wait(firstRequestData.backoffDelay)) { firstRequestData.startRequest(request, model); } + when(wait(secondRequestData.backoffDelay)) { secondRequestData.startRequest(request, model); } + when(ErrorOr result = + wait(firstRequestData.response.isValid() ? 
firstRequestData.response : Never())) { + if (firstRequestData.checkAndProcessResult(atMostOnce)) { + return result.get(); } - when(ErrorOr> result = wait(errorOr(secondRequest))) { - if (result.isError() || result.get().present()) { - if (!firstFinished) { - addLaggingRequest(firstRequest, requestFinished, model); - } - if (result.isError()) { - throw result.getError(); - } else { - return result.get().get(); - } - } - break; + firstRequestEndpoint = Optional(); + firstFinished = true; + } + when(ErrorOr result = wait(secondRequestData.response)) { + if (secondRequestData.checkAndProcessResult(atMostOnce)) { + return result.get(); } + + break; } } @@ -445,13 +497,13 @@ Future loadBalance( } } else { // Issue a request, if it takes too long to get a reply, go around the loop - firstRequest = makeRequest( - stream, request, backoff, requestFinished.getFuture(), model, true, atMostOnce, triedAllOptions); + firstRequestData.setupRequest(backoff, triedAllOptions, stream); firstRequestEndpoint = stream->getEndpoint().token.first(); loop { choose { - when(ErrorOr> result = wait(errorOr(firstRequest))) { + when(wait(firstRequestData.backoffDelay)) { firstRequestData.startRequest(request, model); } + when(ErrorOr result = wait(firstRequestData.response)) { if (model) { model->secondMultiplier = std::max(model->secondMultiplier - FLOW_KNOBS->SECOND_REQUEST_MULTIPLIER_DECAY, 1.0); @@ -460,15 +512,10 @@ Future loadBalance( FLOW_KNOBS->SECOND_REQUEST_MAX_BUDGET); } - if (result.isError()) { - throw result.getError(); + if (firstRequestData.checkAndProcessResult(atMostOnce)) { + return result.get(); } - if (result.get().present()) { - return result.get().get(); - } - - firstRequest = Future>(); firstRequestEndpoint = Optional(); break; } From f7d3b31ef8f93a9ec845bef3a8216e70c384d804 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Wed, 24 Mar 2021 16:27:35 -0600 Subject: [PATCH 031/461] Actually close files in simulation --- fdbrpc/AsyncFileNonDurable.actor.h | 4 ++++ 
fdbrpc/sim2.actor.cpp | 16 ++++++++-------- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/fdbrpc/AsyncFileNonDurable.actor.h b/fdbrpc/AsyncFileNonDurable.actor.h index 49fe0e2c8f..13fdcc25a5 100644 --- a/fdbrpc/AsyncFileNonDurable.actor.h +++ b/fdbrpc/AsyncFileNonDurable.actor.h @@ -267,6 +267,10 @@ public: Future deleteFuture = deleteFile(this); if (!deleteFuture.isReady()) filesBeingDeleted[filename] = deleteFuture; + } else if (isSoleOwner()) { + // isSoleOwner is a bit confusing here. What we mean is that the openFiles map is the sole owner. If we + // we remove the file from the map to make sure it gets closed. + g_simulator.getCurrentProcess()->machine->openFiles.erase(filename); } } diff --git a/fdbrpc/sim2.actor.cpp b/fdbrpc/sim2.actor.cpp index 1af14ec676..6cddbb7e88 100644 --- a/fdbrpc/sim2.actor.cpp +++ b/fdbrpc/sim2.actor.cpp @@ -536,7 +536,10 @@ public: std::string getFilename() const override { return actualFilename; } - ~SimpleFile() override { _close(h); } + ~SimpleFile() override { + _close(h); + --openCount; + } private: int h; @@ -1933,10 +1936,7 @@ public: TraceEvent("ClogInterface") .detail("IP", ip.toString()) .detail("Delay", seconds) - .detail("Queue", - mode == ClogSend ? "Send" - : mode == ClogReceive ? "Receive" - : "All"); + .detail("Queue", mode == ClogSend ? "Send" : mode == ClogReceive ? "Receive" : "All"); if (mode == ClogSend || mode == ClogAll) g_clogging.clogSendFor(ip, seconds); @@ -2408,9 +2408,9 @@ int sf_open(const char* filename, int flags, int convFlags, int mode) { GENERIC_READ | ((flags & IAsyncFile::OPEN_READWRITE) ? GENERIC_WRITE : 0), FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, nullptr, - (flags & IAsyncFile::OPEN_EXCLUSIVE) ? CREATE_NEW - : (flags & IAsyncFile::OPEN_CREATE) ? OPEN_ALWAYS - : OPEN_EXISTING, + (flags & IAsyncFile::OPEN_EXCLUSIVE) + ? CREATE_NEW + : (flags & IAsyncFile::OPEN_CREATE) ? 
OPEN_ALWAYS : OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, nullptr); int h = -1; From 6a344ddeab4eac19ee34f1af7649a6b5e8e39efc Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Wed, 24 Mar 2021 16:56:11 -0600 Subject: [PATCH 032/461] fix typo --- fdbrpc/AsyncFileNonDurable.actor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbrpc/AsyncFileNonDurable.actor.h b/fdbrpc/AsyncFileNonDurable.actor.h index 13fdcc25a5..8cc65bf4a5 100644 --- a/fdbrpc/AsyncFileNonDurable.actor.h +++ b/fdbrpc/AsyncFileNonDurable.actor.h @@ -269,7 +269,7 @@ public: filesBeingDeleted[filename] = deleteFuture; } else if (isSoleOwner()) { // isSoleOwner is a bit confusing here. What we mean is that the openFiles map is the sole owner. If we - // we remove the file from the map to make sure it gets closed. + // remove the file from the map to make sure it gets closed. g_simulator.getCurrentProcess()->machine->openFiles.erase(filename); } } From b51e4aa59048ed73afbb6a6d82b4d86f520f6129 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Wed, 24 Mar 2021 19:57:24 -0600 Subject: [PATCH 033/461] handle file renames properly --- fdbrpc/AsyncFileNonDurable.actor.h | 12 +++++++++++- flow/flow.h | 2 ++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/fdbrpc/AsyncFileNonDurable.actor.h b/fdbrpc/AsyncFileNonDurable.actor.h index 8cc65bf4a5..21cfda8907 100644 --- a/fdbrpc/AsyncFileNonDurable.actor.h +++ b/fdbrpc/AsyncFileNonDurable.actor.h @@ -270,7 +270,17 @@ public: } else if (isSoleOwner()) { // isSoleOwner is a bit confusing here. What we mean is that the openFiles map is the sole owner. If we // remove the file from the map to make sure it gets closed. - g_simulator.getCurrentProcess()->machine->openFiles.erase(filename); + auto& openFiles = g_simulator.getCurrentProcess()->machine->openFiles; + auto iter = openFiles.find(filename); + // the file could've been renamed (DiskQueue does that for example). In that case the file won't be in the + // map anymore. 
+ if (iter != openFiles.end()) { + // even if the filename exists, it doesn't mean that it references the same file. It could be that the + // file was renamed and later a file with the same name was opened. + if (iter->second.canGet() && iter->second.get().getPtr() == this) { + openFiles.erase(filename); + } + } } } diff --git a/flow/flow.h b/flow/flow.h index 987572d7c5..e03d598d9b 100644 --- a/flow/flow.h +++ b/flow/flow.h @@ -674,6 +674,8 @@ public: bool isValid() const { return sav != 0; } bool isReady() const { return sav->isSet(); } bool isError() const { return sav->isError(); } + // returns true if get can be called on this future (counterpart of canBeSet on Promises) + bool canGet() const { return isValid() && isReady() && !isError(); } Error& getError() const { ASSERT(isError()); return sav->error_state; From 1385a776daa0b90cb20478251d0faf8766cb1a10 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Thu, 25 Mar 2021 13:22:29 -0600 Subject: [PATCH 034/461] only remove files from the open map if they have no modifications in flight --- fdbrpc/AsyncFileNonDurable.actor.h | 49 ++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/fdbrpc/AsyncFileNonDurable.actor.h b/fdbrpc/AsyncFileNonDurable.actor.h index 21cfda8907..281b3f289d 100644 --- a/fdbrpc/AsyncFileNonDurable.actor.h +++ b/fdbrpc/AsyncFileNonDurable.actor.h @@ -259,6 +259,37 @@ public: //TraceEvent("AsyncFileNonDurable_Destroy", id).detail("Filename", filename); } + // The purpose of this actor is to simply keep a reference to a non-durable file until all pending modifications + // have completed. When they return, this actor will die and therefore decrement the reference count by 1. 
+ ACTOR void waitOnOutstandingModifications(Reference self) { + state ISimulator::ProcessInfo* currentProcess = g_simulator.getCurrentProcess(); + state TaskPriority currentTaskID = g_network->getCurrentTask(); + state std::string filename = self->filename; + + wait(g_simulator.onMachine(currentProcess)); + try { + Promise startSyncPromise = self->startSyncPromise; + self->startSyncPromise = Promise(); + startSyncPromise.send(true); + + std::vector> outstandingModifications; + + for (auto itr = self->pendingModifications.ranges().begin(); + itr != self->pendingModifications.ranges().end(); + ++itr) + if (itr->value().isValid() && !itr->value().isReady()) + outstandingModifications.push_back(itr->value()); + + // Ignore errors here so that all modifications can finish + wait(waitForAllReady(outstandingModifications)); + wait(g_simulator.onProcess(currentProcess, currentTaskID)); + } catch (Error& e) { + state Error err = e; + wait(g_simulator.onProcess(currentProcess, currentTaskID)); + throw err; + } + } + void addref() override { ReferenceCounted::addref(); } void delref() override { if (delref_no_destroy()) { @@ -270,6 +301,24 @@ public: } else if (isSoleOwner()) { // isSoleOwner is a bit confusing here. What we mean is that the openFiles map is the sole owner. If we // remove the file from the map to make sure it gets closed. + bool hasPendingModifications = false; + for (auto iter = pendingModifications.ranges().begin(); iter != pendingModifications.ranges().end(); + ++iter) { + if (iter->value().isValid() && !iter->value().isReady()) { + hasPendingModifications = true; + break; + } + } + if (hasPendingModifications) { + // If we still have pending references we won't close the file and instead wait for them. But while we + // wait for those to complete, another actor might open the file. So we call into an actor that will + // hold a refernce until all pending operations are complete. If someone opens this file before this + // completes, nothing will happen. 
Otherwise we will enter delref again but this time + // hasPendingModifications will evalualte to false. + addref(); + waitOnOutstandingModifications(Reference(this)); + return; + } auto& openFiles = g_simulator.getCurrentProcess()->machine->openFiles; auto iter = openFiles.find(filename); // the file could've been renamed (DiskQueue does that for example). In that case the file won't be in the From 1033db9fba275a809b3159fc2d52a92293350a45 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Thu, 25 Mar 2021 14:00:07 -0600 Subject: [PATCH 035/461] Revert change --- fdbrpc/AsyncFileNonDurable.actor.h | 47 +++++++----------------------- 1 file changed, 11 insertions(+), 36 deletions(-) diff --git a/fdbrpc/AsyncFileNonDurable.actor.h b/fdbrpc/AsyncFileNonDurable.actor.h index 281b3f289d..f65895067e 100644 --- a/fdbrpc/AsyncFileNonDurable.actor.h +++ b/fdbrpc/AsyncFileNonDurable.actor.h @@ -267,27 +267,20 @@ public: state std::string filename = self->filename; wait(g_simulator.onMachine(currentProcess)); - try { - Promise startSyncPromise = self->startSyncPromise; - self->startSyncPromise = Promise(); - startSyncPromise.send(true); + Promise startSyncPromise = self->startSyncPromise; + self->startSyncPromise = Promise(); + startSyncPromise.send(true); - std::vector> outstandingModifications; + std::vector> outstandingModifications; - for (auto itr = self->pendingModifications.ranges().begin(); - itr != self->pendingModifications.ranges().end(); - ++itr) - if (itr->value().isValid() && !itr->value().isReady()) - outstandingModifications.push_back(itr->value()); + for (auto itr = self->pendingModifications.ranges().begin(); itr != self->pendingModifications.ranges().end(); + ++itr) + if (itr->value().isValid() && !itr->value().isReady()) + outstandingModifications.push_back(itr->value()); - // Ignore errors here so that all modifications can finish - wait(waitForAllReady(outstandingModifications)); - wait(g_simulator.onProcess(currentProcess, currentTaskID)); - } catch 
(Error& e) { - state Error err = e; - wait(g_simulator.onProcess(currentProcess, currentTaskID)); - throw err; - } + // Ignore errors here so that all modifications can finish + wait(waitForAllReady(outstandingModifications)); + wait(g_simulator.onProcess(currentProcess, currentTaskID)); } void addref() override { ReferenceCounted::addref(); } @@ -301,24 +294,6 @@ public: } else if (isSoleOwner()) { // isSoleOwner is a bit confusing here. What we mean is that the openFiles map is the sole owner. If we // remove the file from the map to make sure it gets closed. - bool hasPendingModifications = false; - for (auto iter = pendingModifications.ranges().begin(); iter != pendingModifications.ranges().end(); - ++iter) { - if (iter->value().isValid() && !iter->value().isReady()) { - hasPendingModifications = true; - break; - } - } - if (hasPendingModifications) { - // If we still have pending references we won't close the file and instead wait for them. But while we - // wait for those to complete, another actor might open the file. So we call into an actor that will - // hold a refernce until all pending operations are complete. If someone opens this file before this - // completes, nothing will happen. Otherwise we will enter delref again but this time - // hasPendingModifications will evalualte to false. - addref(); - waitOnOutstandingModifications(Reference(this)); - return; - } auto& openFiles = g_simulator.getCurrentProcess()->machine->openFiles; auto iter = openFiles.find(filename); // the file could've been renamed (DiskQueue does that for example). 
In that case the file won't be in the From c3ba4659ff461d3a5eb16eaa62d563627ea2032b Mon Sep 17 00:00:00 2001 From: Andrew Noyes Date: Fri, 26 Mar 2021 18:06:21 +0000 Subject: [PATCH 036/461] Document that ryw disable can only be set at beginning of transaction --- fdbclient/vexillographer/fdb.options | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbclient/vexillographer/fdb.options b/fdbclient/vexillographer/fdb.options index 82ba1910c2..db68bb31a4 100644 --- a/fdbclient/vexillographer/fdb.options +++ b/fdbclient/vexillographer/fdb.options @@ -210,7 +210,7 @@ description is not currently required but encouraged. \ No newline at end of file diff --git a/flow/actorcompiler/actorcompiler.sln b/flow/actorcompiler/actorcompiler.sln deleted file mode 100644 index a4292bfaaa..0000000000 --- a/flow/actorcompiler/actorcompiler.sln +++ /dev/null @@ -1,34 +0,0 @@ - -Microsoft Visual Studio Solution File, Format Version 12.00 -# Visual Studio 15 -VisualStudioVersion = 15.0.26124.0 -MinimumVisualStudioVersion = 15.0.26124.0 -Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "actorcompiler", "actorcompiler.csproj", "{0ECC1314-3FC2-458D-8E41-B50B4EA24E51}" -EndProject -Global - GlobalSection(SolutionConfigurationPlatforms) = preSolution - Debug|Any CPU = Debug|Any CPU - Debug|x64 = Debug|x64 - Debug|x86 = Debug|x86 - Release|Any CPU = Release|Any CPU - Release|x64 = Release|x64 - Release|x86 = Release|x86 - EndGlobalSection - GlobalSection(SolutionProperties) = preSolution - HideSolutionNode = FALSE - EndGlobalSection - GlobalSection(ProjectConfigurationPlatforms) = postSolution - {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Debug|Any CPU.ActiveCfg = Debug|Any CPU - {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Debug|Any CPU.Build.0 = Debug|Any CPU - {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Debug|x64.ActiveCfg = Debug|Any CPU - {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Debug|x64.Build.0 = Debug|Any CPU - {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Debug|x86.ActiveCfg = Debug|Any 
CPU - {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Debug|x86.Build.0 = Debug|Any CPU - {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Release|Any CPU.ActiveCfg = Debug|Any CPU - {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Release|Any CPU.Build.0 = Debug|Any CPU - {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Release|x64.ActiveCfg = Debug|Any CPU - {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Release|x64.Build.0 = Debug|Any CPU - {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Release|x86.ActiveCfg = Debug|Any CPU - {0ECC1314-3FC2-458D-8E41-B50B4EA24E51}.Release|x86.Build.0 = Debug|Any CPU - EndGlobalSection -EndGlobal diff --git a/flow/flow.cpp b/flow/flow.cpp index ec65640fe2..74f0b334f5 100644 --- a/flow/flow.cpp +++ b/flow/flow.cpp @@ -26,27 +26,6 @@ #include #include -thread_local Reference currentLineage; -WriteOnlyVariable currentLineageThreadSafe; - -LineagePropertiesBase::~LineagePropertiesBase() {} - -ActorLineage::ActorLineage() : properties(), parent(currentLineage) {} - -ActorLineage::~ActorLineage() { - for (auto ptr : properties) { - delete ptr.second; - } -} - -using namespace std::literals; - -const std::string_view StackLineage::name = "StackLineage"sv; - -std::vector getActorStackTrace() { - return currentLineage->stack(&StackLineage::actorName); -} - #if (defined(__linux__) || defined(__FreeBSD__)) && defined(__AVX__) && !defined(MEMORY_SANITIZER) // For benchmarking; need a version of rte_memcpy that doesn't live in the same compilation unit as the test. void* rte_memcpy_noinline(void* __restrict __dest, const void* __restrict __src, size_t __n) { diff --git a/flow/flow.h b/flow/flow.h index 8388113253..987572d7c5 100644 --- a/flow/flow.h +++ b/flow/flow.h @@ -20,8 +20,6 @@ #ifndef FLOW_FLOW_H #define FLOW_FLOW_H -#include "flow/Arena.h" -#include "flow/FastRef.h" #pragma once #pragma warning(disable : 4244 4267) // SOMEDAY: Carefully check for integer overflow issues (e.g. 
size_t to int @@ -31,18 +29,14 @@ #include #include -#include #include #include #include #include #include #include -#include #include #include -#include -#include #include "flow/Platform.h" #include "flow/FastAlloc.h" @@ -52,7 +46,6 @@ #include "flow/ThreadPrimitives.h" #include "flow/network.h" #include "flow/FileIdentifier.h" -#include "flow/WriteOnlySet.h" #include @@ -427,127 +420,6 @@ struct SingleCallback { } }; -struct LineagePropertiesBase { - virtual ~LineagePropertiesBase(); -}; - -// helper class to make implementation of LineageProperties easier -template -struct LineageProperties : LineagePropertiesBase { - // Contract: - // - // StringRef name = "SomeUniqueName"_str; - - // this has to be implemented by subclasses - // but can't be made virtual. - // A user should implement this for any type - // within the properies class. - template - bool isSet(Value Derived::*member) const { - return true; - } -}; - -struct ActorLineage : ThreadSafeReferenceCounted { - friend class LocalLineage; - -private: - std::unordered_map properties; - Reference parent; - mutable std::mutex mutex; - using Lock = std::unique_lock; - -public: - ActorLineage(); - ~ActorLineage(); - bool isRoot() const { - Lock _{ mutex }; - return parent.getPtr() == nullptr; - } - void makeRoot() { - Lock _{ mutex }; - parent.clear(); - } - template - V& modify(V T::*member) { - Lock _{ mutex }; - auto& res = properties[T::name]; - if (!res) { - res = new T{}; - } - T* map = static_cast(res); - return map->*member; - } - template - std::optional get(V T::*member) const { - Lock _{ mutex }; - auto current = this; - while (current != nullptr) { - auto iter = current->properties.find(T::name); - if (iter != current->properties.end()) { - T const& map = static_cast(*iter->second); - if (map.isSet(member)) { - return map.*member; - } - } - current = current->parent.getPtr(); - } - return std::optional{}; - } - template - std::vector stack(V T::*member) const { - Lock _{ mutex }; - auto current = 
this; - std::vector res; - while (current != nullptr) { - auto iter = current->properties.find(T::name); - if (iter != current->properties.end()) { - T const& map = static_cast(*iter->second); - if (map.isSet(member)) { - res.push_back(map.*member); - } - } - current = current->parent.getPtr(); - } - return res; - } -}; - -extern thread_local Reference currentLineage; -extern WriteOnlyVariable currentLineageThreadSafe; - -// This class can be used in order to modify all lineage properties -// of actors created within a (non-actor) scope -struct LocalLineage { - Reference lineage = Reference{ new ActorLineage() }; - Reference oldLineage; - LocalLineage() { - oldLineage = currentLineage; - currentLineage = lineage; - currentLineageThreadSafe.replace(lineage); - } - ~LocalLineage() { - currentLineage = oldLineage; - currentLineageThreadSafe.replace(oldLineage); - } -}; - -struct restore_lineage { - Reference prev; - restore_lineage() : prev(currentLineage) {} - ~restore_lineage() { - currentLineage = prev; - currentLineageThreadSafe.replace(prev); - } -}; - -struct StackLineage : LineageProperties { - static const std::string_view name; - StringRef actorName; -}; - -extern std::vector getActorStackTrace(); - // SAV is short for Single Assignment Variable: It can be assigned for only once! 
template struct SAV : private Callback, FastAllocated> { @@ -589,9 +461,8 @@ public: ASSERT(canBeSet()); new (&value_storage) T(std::forward(value)); this->error_state = Error::fromCode(SET_ERROR_CODE); - while (Callback::next != this) { + while (Callback::next != this) Callback::next->fire(this->value()); - } } void send(Never) { @@ -602,9 +473,8 @@ public: void sendError(Error err) { ASSERT(canBeSet() && int16_t(err.code()) > 0); this->error_state = err; - while (Callback::next != this) { + while (Callback::next != this) Callback::next->error(err); - } } template @@ -753,9 +623,8 @@ struct NotifiedQueue : private SingleCallback, FastAllocated return; this->error = err; - if (SingleCallback::next != this) { + if (SingleCallback::next != this) SingleCallback::next->error(err); - } } void addPromiseRef() { promises++; } @@ -1123,73 +992,36 @@ static inline void destruct(T& t) { template struct Actor : SAV { - Reference lineage = Reference{ new ActorLineage() }; int8_t actor_wait_state; // -1 means actor is cancelled; 0 means actor is not waiting; 1-N mean waiting in callback // group # - Actor() : SAV(1, 1), actor_wait_state(0) { - /*++actorCount;*/ - currentLineage = lineage; - currentLineageThreadSafe.replace(lineage); + Actor() : SAV(1, 1), actor_wait_state(0) { /*++actorCount;*/ } //~Actor() { --actorCount; } - - Reference setLineage() { - auto res = currentLineage; - currentLineage = lineage; - currentLineageThreadSafe.replace(lineage); - return res; - } }; template <> struct Actor { // This specialization is for a void actor (one not returning a future, hence also uncancellable) - Reference lineage = Reference{ new ActorLineage() }; int8_t actor_wait_state; // 0 means actor is not waiting; 1-N mean waiting in callback group # - Actor() : actor_wait_state(0) { - /*++actorCount;*/ - currentLineage = lineage; - currentLineageThreadSafe.replace(lineage); + Actor() : actor_wait_state(0) { /*++actorCount;*/ } //~Actor() { --actorCount; } - - Reference setLineage() { 
- auto res = currentLineage; - currentLineage = lineage; - currentLineageThreadSafe.replace(lineage); - return res; - } }; template struct ActorCallback : Callback { - virtual void fire(ValueType const& value) override { - auto _ = static_cast(this)->setLineage(); - static_cast(this)->a_callback_fire(this, value); - } - virtual void error(Error e) override { - auto _ = static_cast(this)->setLineage(); - static_cast(this)->a_callback_error(this, e); - } + void fire(ValueType const& value) override { static_cast(this)->a_callback_fire(this, value); } + void error(Error e) override { static_cast(this)->a_callback_error(this, e); } }; template struct ActorSingleCallback : SingleCallback { - void fire(ValueType const& value) override { - auto _ = static_cast(this)->setLineage(); - static_cast(this)->a_callback_fire(this, value); - } - void fire(ValueType&& value) override { - auto _ = static_cast(this)->setLineage(); - static_cast(this)->a_callback_fire(this, std::move(value)); - } - void error(Error e) override { - auto _ = static_cast(this)->setLineage(); - static_cast(this)->a_callback_error(this, e); - } + void fire(ValueType const& value) override { static_cast(this)->a_callback_fire(this, value); } + void fire(ValueType&& value) override { static_cast(this)->a_callback_fire(this, std::move(value)); } + void error(Error e) override { static_cast(this)->a_callback_error(this, e); } }; inline double now() { return g_network->now(); diff --git a/flow/genericactors.actor.h b/flow/genericactors.actor.h index 8561bc623c..46cdb6d73f 100644 --- a/flow/genericactors.actor.h +++ b/flow/genericactors.actor.h @@ -1547,10 +1547,6 @@ struct YieldedFutureActor : SAV, ActorCallback setLineage() { - return currentLineage; - } - void a_callback_fire(ActorCallback*, Void) { if (int16_t(in_error_state.code()) == UNSET_ERROR_CODE) { in_error_state = Error::fromCode(SET_ERROR_CODE); diff --git a/flow/network.h b/flow/network.h index e5683e4ca7..1eeb5bdc2d 100644 --- a/flow/network.h +++ 
b/flow/network.h @@ -35,7 +35,6 @@ #include "flow/Arena.h" #include "flow/IRandom.h" #include "flow/Trace.h" -#include "flow/WriteOnlySet.h" enum class TaskPriority { Max = 1000000, @@ -560,9 +559,6 @@ public: // returns false. virtual bool checkRunnable() = 0; - // Returns the shared memory data structure used to store actor lineages. - virtual ActorLineageSet& getActorLineageSet() = 0; - virtual ProtocolVersion protocolVersion() = 0; // Shorthand for transport().getLocalAddress() diff --git a/flow/singleton.h b/flow/singleton.h deleted file mode 100644 index c6a256ac42..0000000000 --- a/flow/singleton.h +++ /dev/null @@ -1,237 +0,0 @@ -/* - * (C) Copyright 2015 ETH Zurich Systems Group (http://www.systems.ethz.ch/) and others. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - * Contributors: - * Markus Pilman - * Simon Loesing - * Thomas Etter - * Kevin Bocksrocker - * Lucas Braun - */ -#pragma once - -#include -#include -#include -#include - -namespace crossbow { - -/** - * @brief A mock mutex for disabling locking in the singleton - * - * This class implements the mutex concept with empty methods. - * This can be used to disable synchronization in the singleton - * holder. 
- */ -struct no_locking { - void lock() {} - void unlock() {} - bool try_lock() { return true; } -}; - -template -struct create_static { - static constexpr bool supports_recreation = false; - union max_align { - char t_[sizeof(T)]; - short int short_int_; - long int long_int_; - float float_; - double double_; - long double longDouble_; - struct Test; - int Test::*pMember_; - int (Test::*pMemberFn_)(int); - }; - - static T* create() { - static max_align static_memory_; - return new (&static_memory_) T; - } - - static void destroy(T* ptr) { ptr->~T(); } -}; - -template -struct create_using_new { - static constexpr bool supports_recreation = true; - static T* create() { return new T; }; - - static void destroy(T* ptr) { delete ptr; } -}; - -template -struct create_using_malloc { - static constexpr bool supports_recreation = true; - static T* create() { - void* p = std::malloc(sizeof(T)); - if (!p) - return nullptr; - return new (p) T; - } - - static void destroy(T* ptr) { - ptr->~T(); - free(ptr); - } -}; - -template -struct create_using { - static constexpr bool supports_recreation = true; - static allocator alloc_; - - static T* create() { - T* p = alloc_.allocate(1); - if (!p) - return nullptr; - alloc_.construct(p); - return p; - }; - - static void destroy(T* ptr) { - alloc_.destroy(ptr); - alloc_.deallocate(ptr, 1); - } -}; - -template -struct default_lifetime { - static void schedule_destruction(T*, void (*func)()) { std::atexit(func); } - - static void on_dead_ref() { throw std::logic_error("Dead reference detected"); } -}; - -template -struct phoenix_lifetime { - static void schedule_destruction(T*, void (*func)()) { std::atexit(func); } - - static void on_dead_ref() {} -}; - -template -struct infinite_lifetime { - static void schedule_destruction(T*, void (*)()) {} - static void on_dead_ref() {} -}; - -template -struct lifetime_traits { - static constexpr bool supports_recreation = true; -}; - -template -struct lifetime_traits> { - static constexpr bool 
supports_recreation = false; -}; - -template -struct lifetime_traits> { - static constexpr bool supports_recreation = false; -}; - -template , - typename LifetimePolicy = default_lifetime, - typename Mutex = std::mutex> -class singleton { -public: - typedef Type value_type; - typedef Type* pointer; - typedef const Type* const_pointer; - typedef const Type& const_reference; - typedef Type& reference; - -private: - static bool destroyed_; - static pointer instance_; - static Mutex mutex_; - - static void destroy() { - if (destroyed_) - return; - Create::destroy(instance_); - instance_ = nullptr; - destroyed_ = true; - } - -public: - static reference instance() { - static_assert(Create::supports_recreation || !lifetime_traits::supports_recreation, - "The creation policy does not support instance recreation, while the lifetime does support it."); - if (!instance_) { - std::lock_guard l(mutex_); - if (!instance_) { - if (destroyed_) { - destroyed_ = false; - LifetimePolicy::on_dead_ref(); - } - instance_ = Create::create(); - LifetimePolicy::schedule_destruction(instance_, &destroy); - } - } - return *instance_; - } - /** - * WARNING: DO NOT EXECUTE THIS MULTITHREADED!!! 
- */ - static void destroy_instance() { - if (instance_) { - std::lock_guard l(mutex_); - destroy(); - } - } - -public: - pointer operator->() { - if (!instance_) { - instance(); - } - return instance_; - } - - reference operator*() { - if (!instance_) { - instance(); - } - return *instance_; - } - - const_pointer operator->() const { - if (!instance_) { - instance(); - } - return instance_; - } - - const_reference operator*() const { - if (!instance_) { - instance(); - } - return *instance_; - } -}; - -template -bool singleton::destroyed_ = false; - -template -typename singleton::pointer singleton::instance_ = nullptr; - -template -M singleton::mutex_; - -} // namespace crossbow \ No newline at end of file diff --git a/tests/TestRunner/local_cluster.py b/tests/TestRunner/local_cluster.py index 85f2094774..68318d51dd 100644 --- a/tests/TestRunner/local_cluster.py +++ b/tests/TestRunner/local_cluster.py @@ -38,7 +38,7 @@ cluster_file = {etcdir}/fdb.cluster command = {fdbserver_bin} public_address = auto:$ID listen_address = public -datadir = {datadir}/$ID +datadir = {datadir} logdir = {logdir} # logsize = 10MiB # maxlogssize = 100MiB From 359abfb0087b68d028ad81d365cf7450eca58167 Mon Sep 17 00:00:00 2001 From: Chaoguang Lin Date: Tue, 11 May 2021 12:08:48 -0700 Subject: [PATCH 378/461] Update FDB_API_VERSION to 710 --- fdbcli/fdbcli.actor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbcli/fdbcli.actor.cpp b/fdbcli/fdbcli.actor.cpp index 4d49c8efc6..49ca2547ff 100644 --- a/fdbcli/fdbcli.actor.cpp +++ b/fdbcli/fdbcli.actor.cpp @@ -60,7 +60,7 @@ #include "flow/actorcompiler.h" // This must be the last #include. -#define FDB_API_VERSION 700 +#define FDB_API_VERSION 710 /* * While we could just use the MultiVersionApi instance directly, this #define allows us to swap in any other IClientApi * instance (e.g. 
from ThreadSafeApi) From 8002a389d4ecb5abc78cf5de027b33ea85c035dc Mon Sep 17 00:00:00 2001 From: Chaoguang Lin Date: Tue, 11 May 2021 12:12:33 -0700 Subject: [PATCH 379/461] add comments for error handling in ConsistencyCheckCommand.actor.cpp --- fdbcli/ConsistencyCheckCommand.actor.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fdbcli/ConsistencyCheckCommand.actor.cpp b/fdbcli/ConsistencyCheckCommand.actor.cpp index 4c4370ff30..38fc310237 100644 --- a/fdbcli/ConsistencyCheckCommand.actor.cpp +++ b/fdbcli/ConsistencyCheckCommand.actor.cpp @@ -31,6 +31,8 @@ using namespace fdb_cli; ACTOR static Future consistencyCheckCommandActor(Reference tr, std::vector tokens) { + // We do not add a try-catch loop here as the this transaction is always supposed to succeed + // If not, the outer loop catch block(fdbcli.actor.cpp) will handle the error and print out the error message tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); if (tokens.size() == 1) { Optional suspended = wait(safeThreadFutureToFuture(tr->get(consistencyCheckSpecialKey))); From ed3415c93e202f0d2a3fb219ef1b86b83ac561cf Mon Sep 17 00:00:00 2001 From: Chaoguang Lin Date: Tue, 11 May 2021 12:21:09 -0700 Subject: [PATCH 380/461] Guard the added unit test by NOT OPEN_FOR_IDE --- tests/CMakeLists.txt | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 76bab08cde..e12b1e3ce9 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -268,13 +268,12 @@ if(WITH_PYTHON) NAME multiversion_client/unit_tests COMMAND $ -r unittests -f /fdbclient/multiversionclient/ ) + add_test( + NAME threadsafe_threadfuture_to_future/unit_tests + COMMAND $ -r unittests -f /flow/safeThreadFutureToFuture/ + ) endif() - add_test( - NAME threadsafe_threadfuture_to_future/unit_tests - COMMAND $ -r unittests -f /flow/safeThreadFutureToFuture/ - ) - verify_testing() if (NOT OPEN_FOR_IDE AND NOT WIN32) create_correctness_package() From 
b0554b455478cb9d039ea4df17fb200eec88d1b2 Mon Sep 17 00:00:00 2001 From: Sreenath Bodagala Date: Tue, 11 May 2021 20:03:21 +0000 Subject: [PATCH 381/461] Capture how fast an SS is catching up to its tLog-SS lag Changes: LogSystem.h, LogSystemPeekCursor.actor.cpp: Add APIs to find the ID of the tLog from which an SS has fetched the latest set of versions. storageserver.actor.cpp: Capture the number of latest set of versions fetched, the time (in seconds) in which those versions were fetched, and the tLog from which they were fetched. Add this information to a TraceLogEvent. Capture how many versions an SS has fetched in the --- fdbserver/LogSystem.h | 7 +++++++ fdbserver/LogSystemPeekCursor.actor.cpp | 28 ++++++++++++++++++++++++- fdbserver/storageserver.actor.cpp | 19 +++++++++++++++-- 3 files changed, 51 insertions(+), 3 deletions(-) diff --git a/fdbserver/LogSystem.h b/fdbserver/LogSystem.h index f8a3e0b725..da2fbcf5f2 100644 --- a/fdbserver/LogSystem.h +++ b/fdbserver/LogSystem.h @@ -410,6 +410,8 @@ struct ILogSystem { virtual Optional getPrimaryPeekLocation() const = 0; + virtual Optional getCurrentPeekLocation() const = 0; + virtual void addref() = 0; virtual void delref() = 0; @@ -473,6 +475,7 @@ struct ILogSystem { Version popped() const override; Version getMinKnownCommittedVersion() const override; Optional getPrimaryPeekLocation() const override; + Optional getCurrentPeekLocation() const override; void addref() override { ReferenceCounted::addref(); } @@ -534,6 +537,7 @@ struct ILogSystem { Version popped() const override; Version getMinKnownCommittedVersion() const override; Optional getPrimaryPeekLocation() const override; + Optional getCurrentPeekLocation() const override; void addref() override { ReferenceCounted::addref(); } @@ -589,6 +593,7 @@ struct ILogSystem { Version popped() const override; Version getMinKnownCommittedVersion() const override; Optional getPrimaryPeekLocation() const override; + Optional getCurrentPeekLocation() const override; 
void addref() override { ReferenceCounted::addref(); } @@ -620,6 +625,7 @@ struct ILogSystem { Version popped() const override; Version getMinKnownCommittedVersion() const override; Optional getPrimaryPeekLocation() const override; + Optional getCurrentPeekLocation() const override; void addref() override { ReferenceCounted::addref(); } @@ -698,6 +704,7 @@ struct ILogSystem { Version popped() const override; Version getMinKnownCommittedVersion() const override; Optional getPrimaryPeekLocation() const override; + Optional getCurrentPeekLocation() const override; void addref() override { ReferenceCounted::addref(); } diff --git a/fdbserver/LogSystemPeekCursor.actor.cpp b/fdbserver/LogSystemPeekCursor.actor.cpp index cc8a350845..09e692e0b6 100644 --- a/fdbserver/LogSystemPeekCursor.actor.cpp +++ b/fdbserver/LogSystemPeekCursor.actor.cpp @@ -393,12 +393,16 @@ Version ILogSystem::ServerPeekCursor::getMinKnownCommittedVersion() const { } Optional ILogSystem::ServerPeekCursor::getPrimaryPeekLocation() const { - if (interf) { + if (interf->get().present()) { return interf->get().id(); } return Optional(); } +Optional ILogSystem::ServerPeekCursor::getCurrentPeekLocation() const { + return ILogSystem::ServerPeekCursor::getPrimaryPeekLocation(); +} + Version ILogSystem::ServerPeekCursor::popped() const { return poppedVersion; } @@ -673,6 +677,13 @@ Optional ILogSystem::MergedPeekCursor::getPrimaryPeekLocation() const { return Optional(); } +Optional ILogSystem::MergedPeekCursor::getCurrentPeekLocation() const { + if (currentCursor >= 0) { + return serverCursors[currentCursor]->getPrimaryPeekLocation(); + } + return Optional(); +} + Version ILogSystem::MergedPeekCursor::popped() const { Version poppedVersion = 0; for (auto& c : serverCursors) @@ -1023,6 +1034,13 @@ Optional ILogSystem::SetPeekCursor::getPrimaryPeekLocation() const { return Optional(); } +Optional ILogSystem::SetPeekCursor::getCurrentPeekLocation() const { + if (currentCursor >= 0 && currentSet >= 0) { + return 
serverCursors[currentSet][currentCursor]->getPrimaryPeekLocation(); + } + return Optional(); +} + Version ILogSystem::SetPeekCursor::popped() const { Version poppedVersion = 0; for (auto& cursors : serverCursors) { @@ -1123,6 +1141,10 @@ Optional ILogSystem::MultiCursor::getPrimaryPeekLocation() const { return cursors.back()->getPrimaryPeekLocation(); } +Optional ILogSystem::MultiCursor::getCurrentPeekLocation() const { + return cursors.back()->getCurrentPeekLocation(); +} + Version ILogSystem::MultiCursor::popped() const { return std::max(poppedVersion, cursors.back()->popped()); } @@ -1403,6 +1425,10 @@ Optional ILogSystem::BufferedCursor::getPrimaryPeekLocation() const { return Optional(); } +Optional ILogSystem::BufferedCursor::getCurrentPeekLocation() const { + return Optional(); +} + Version ILogSystem::BufferedCursor::popped() const { if (initialPoppedVersion == poppedVersion) { return 0; diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp index c92402ce0d..728dae05fc 100644 --- a/fdbserver/storageserver.actor.cpp +++ b/fdbserver/storageserver.actor.cpp @@ -545,6 +545,10 @@ public: int64_t versionLag; // An estimate for how many versions it takes for the data to move from the logs to this storage // server + int64_t versionCount; + double duration; + Optional sourceTLogID; + ProtocolVersion logProtocol; Reference logSystem; @@ -732,7 +736,7 @@ public: : fetchKeysHistograms(), instanceID(deterministicRandom()->randomUniqueID().first()), storage(this, storage), db(db), actors(false), lastTLogVersion(0), lastVersionWithData(0), restoredVersion(0), rebootAfterDurableVersion(std::numeric_limits::max()), durableInProgress(Void()), versionLag(0), - primaryLocality(tagLocalityInvalid), updateEagerReads(0), shardChangeCounter(0), + versionCount(0), duration(0), primaryLocality(tagLocalityInvalid), updateEagerReads(0), shardChangeCounter(0), fetchKeysParallelismLock(SERVER_KNOBS->FETCH_KEYS_PARALLELISM_BYTES), shuttingDown(false), 
debug_inApplyUpdate(false), debug_lastValidateTime(0), watchBytes(0), numWatches(0), logProtocol(0), counters(this), tag(invalidTag), maxQueryQueue(0), thisServerID(ssi.id()), @@ -3523,9 +3527,20 @@ ACTOR Future update(StorageServer* data, bool* pReceivedUpdate) { if (data->otherError.getFuture().isReady()) data->otherError.getFuture().get(); + auto curTime = now(); + data->versionCount = ver - data->version.get(); + data->duration = curTime - data->lastUpdate; + data->sourceTLogID = cursor->getCurrentPeekLocation(); + + TraceEvent("StorageServerCatchUpRate", data->thisServerID) + .detail("VersionCount", data->versionCount) + .detail("Duration", data->duration) + .detail("SourceTLogId", data->sourceTLogID.present() ? data->sourceTLogID.get().toString() : "unknown"); + data->noRecentUpdates.set(false); - data->lastUpdate = now(); + data->lastUpdate = curTime; data->version.set(ver); // Triggers replies to waiting gets for new version(s) + setDataVersion(data->thisServerID, data->version.get()); if (data->otherError.getFuture().isReady()) data->otherError.getFuture().get(); From 9a6151d3fcb5838f27f6c6f1d685df913981319a Mon Sep 17 00:00:00 2001 From: Chaoguang Lin Date: Tue, 11 May 2021 14:31:08 -0700 Subject: [PATCH 382/461] Update fdbcli.h to fdbcli.actor.h, removed the unnecessary wrapper --- fdbcli/CMakeLists.txt | 2 +- fdbcli/ConsistencyCheckCommand.actor.cpp | 16 +++++----------- fdbcli/Util.cpp | 2 +- fdbcli/fdbcli.actor.cpp | 4 ++-- fdbcli/{fdbcli.h => fdbcli.actor.h} | 15 ++++++++++++--- 5 files changed, 21 insertions(+), 18 deletions(-) rename fdbcli/{fdbcli.h => fdbcli.actor.h} (77%) diff --git a/fdbcli/CMakeLists.txt b/fdbcli/CMakeLists.txt index b97619fc9a..d0cab5b178 100644 --- a/fdbcli/CMakeLists.txt +++ b/fdbcli/CMakeLists.txt @@ -1,6 +1,6 @@ set(FDBCLI_SRCS - fdbcli.h fdbcli.actor.cpp + fdbcli.actor.h ConsistencyCheckCommand.actor.cpp FlowLineNoise.actor.cpp FlowLineNoise.h diff --git a/fdbcli/ConsistencyCheckCommand.actor.cpp 
b/fdbcli/ConsistencyCheckCommand.actor.cpp index 38fc310237..4b8107f954 100644 --- a/fdbcli/ConsistencyCheckCommand.actor.cpp +++ b/fdbcli/ConsistencyCheckCommand.actor.cpp @@ -18,7 +18,7 @@ * limitations under the License. */ -#include "fdbcli/fdbcli.h" +#include "fdbcli/fdbcli.actor.h" #include "fdbclient/FDBOptions.g.h" #include "fdbclient/IClientApi.h" @@ -28,9 +28,11 @@ #include "flow/ThreadHelper.actor.h" #include "flow/actorcompiler.h" // This must be the last #include. -using namespace fdb_cli; +namespace fdb_cli { -ACTOR static Future consistencyCheckCommandActor(Reference tr, std::vector tokens) { +const KeyRef consistencyCheckSpecialKey = LiteralStringRef("\xff\xff/management/consistency_check_suspended"); + +ACTOR Future consistencyCheckCommandActor(Reference tr, std::vector tokens) { // We do not add a try-catch loop here as the this transaction is always supposed to succeed // If not, the outer loop catch block(fdbcli.actor.cpp) will handle the error and print out the error message tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES); @@ -50,14 +52,6 @@ ACTOR static Future consistencyCheckCommandActor(Reference t return true; } -namespace fdb_cli { - -const KeyRef consistencyCheckSpecialKey = LiteralStringRef("\xff\xff/management/consistency_check_suspended"); - -Future consistencyCheckCommand(Reference tr, std::vector tokens) { - return consistencyCheckCommandActor(tr, tokens); -} - CommandFactory consistencyCheckFactory( "consistencycheck", CommandHelp( diff --git a/fdbcli/Util.cpp b/fdbcli/Util.cpp index 2b755bd9d3..f67f27c774 100644 --- a/fdbcli/Util.cpp +++ b/fdbcli/Util.cpp @@ -18,7 +18,7 @@ * limitations under the License. 
*/ -#include "fdbcli/fdbcli.h" +#include "fdbcli/fdbcli.actor.h" #include "flow/Arena.h" diff --git a/fdbcli/fdbcli.actor.cpp b/fdbcli/fdbcli.actor.cpp index 49ca2547ff..d10da845ec 100644 --- a/fdbcli/fdbcli.actor.cpp +++ b/fdbcli/fdbcli.actor.cpp @@ -44,7 +44,7 @@ #include "flow/SimpleOpt.h" #include "fdbcli/FlowLineNoise.h" -#include "fdbcli/fdbcli.h" +#include "fdbcli/fdbcli.actor.h" #include #include @@ -3821,7 +3821,7 @@ ACTOR Future cli(CLIOptions opt, LineNoise* plinenoise) { if (tokencmp(tokens[0], "consistencycheck")) { getTransaction(db, tr, tr2, options, intrans); - bool _result = wait(consistencyCheckCommand(tr2, tokens)); + bool _result = wait(consistencyCheckCommandActor(tr2, tokens)); is_error = !_result; continue; } diff --git a/fdbcli/fdbcli.h b/fdbcli/fdbcli.actor.h similarity index 77% rename from fdbcli/fdbcli.h rename to fdbcli/fdbcli.actor.h index 831de2decd..ceae1263c2 100644 --- a/fdbcli/fdbcli.h +++ b/fdbcli/fdbcli.actor.h @@ -18,13 +18,21 @@ * limitations under the License. */ -#ifndef FDBCLI_H -#define FDBCLI_H #pragma once +// When actually compiled (NO_INTELLISENSE), include the generated +// version of this file. In intellisense use the source version. +#if defined(NO_INTELLISENSE) && !defined(FDBCLI_FDBCLI_ACTOR_G_H) +#define FDBCLI_FDBCLI_ACTOR_G_H +#include "fdbcli/fdbcli.actor.g.h" +#elif !defined(FDBCLI_FDBCLI_ACTOR_H) +#define FDBCLI_FDBCLI_ACTOR_H + #include "fdbclient/IClientApi.h" #include "flow/Arena.h" +#include "flow/actorcompiler.h" // This must be the last #include. 
+ namespace fdb_cli { struct CommandHelp { @@ -62,8 +70,9 @@ void printUsage(StringRef command); // All fdbcli commands (alphabetically) // consistency command -Future consistencyCheckCommand(Reference tr, std::vector tokens); +ACTOR Future consistencyCheckCommandActor(Reference tr, std::vector tokens); } // namespace fdb_cli +#include "flow/unactorcompiler.h" #endif From 6e10a8abf17c235ea8b16e4c3afba60ce7ebfaeb Mon Sep 17 00:00:00 2001 From: Chaoguang Lin Date: Tue, 11 May 2021 14:38:21 -0700 Subject: [PATCH 383/461] fix header's include order --- flow/ThreadHelper.actor.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/flow/ThreadHelper.actor.cpp b/flow/ThreadHelper.actor.cpp index 4c0a89c7d5..06645f8d3e 100644 --- a/flow/ThreadHelper.actor.cpp +++ b/flow/ThreadHelper.actor.cpp @@ -18,13 +18,14 @@ * limitations under the License. */ +#include + +#include "flow/flow.h" +#include "flow/network.h" #include "flow/ThreadHelper.actor.h" #include "flow/Error.h" #include "flow/UnitTest.h" #include "flow/actorcompiler.h" // This must be the last #include. 
-#include "flow/flow.h" -#include "flow/network.h" -#include ThreadCallback* ThreadCallback::addCallback(ThreadCallback* cb) { return (new ThreadMultiCallback())->addCallback(this)->addCallback(cb); From 42eced15c9ed26e46d0284e6630bf3e40c15b6c2 Mon Sep 17 00:00:00 2001 From: Chaoguang Lin Date: Tue, 11 May 2021 16:46:07 -0700 Subject: [PATCH 384/461] Update comments and trigger CI --- fdbcli/ConsistencyCheckCommand.actor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbcli/ConsistencyCheckCommand.actor.cpp b/fdbcli/ConsistencyCheckCommand.actor.cpp index 4b8107f954..892acbb239 100644 --- a/fdbcli/ConsistencyCheckCommand.actor.cpp +++ b/fdbcli/ConsistencyCheckCommand.actor.cpp @@ -33,7 +33,7 @@ namespace fdb_cli { const KeyRef consistencyCheckSpecialKey = LiteralStringRef("\xff\xff/management/consistency_check_suspended"); ACTOR Future consistencyCheckCommandActor(Reference tr, std::vector tokens) { - // We do not add a try-catch loop here as the this transaction is always supposed to succeed + // Here we do not proceed in a try-catch loop since the transaction is always supposed to succeed. 
 // If not, the outer loop catch block(fdbcli.actor.cpp) will handle the error and print out the error message
 tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES);
 if (tokens.size() == 1) {

From 4361dcca2e48a180a260b6680ed72b4fad458119 Mon Sep 17 00:00:00 2001
From: Daniel Smith
Date: Wed, 12 May 2021 11:54:55 -0400
Subject: [PATCH 385/461] Set connectionFile instead of creating a shadow

---
 fdbserver/fdbserver.actor.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fdbserver/fdbserver.actor.cpp b/fdbserver/fdbserver.actor.cpp
index 712186affe..403c2ef48d 100644
--- a/fdbserver/fdbserver.actor.cpp
+++ b/fdbserver/fdbserver.actor.cpp
@@ -1505,7 +1505,7 @@ private:
 fprintf(stderr, "%s\n", ClusterConnectionString::getErrorString(connectionString, e).c_str());
 throw;
 }
- auto connectionFile = makeReference(connFile, ccs);
+ connectionFile = makeReference(connFile, ccs);
 } else {
 std::pair resolvedClusterFile;
 try {

From 78ef6822f6a26c366088f2362433c665e33b2754 Mon Sep 17 00:00:00 2001
From: Sreenath Bodagala
Date: Wed, 12 May 2021 16:40:33 +0000
Subject: [PATCH 386/461] Capture how fast an SS is catching up to its tLog-SS lag

Changes:

storageserver.actor.cpp:
- Report "fetchedVersions" and "duration" as part of StorageMetrics trace event.
- Report "sourceTLogID" as a separate trace event (and report this only when it changes).
--- fdbserver/storageserver.actor.cpp | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp index 728dae05fc..8cd4680f6d 100644 --- a/fdbserver/storageserver.actor.cpp +++ b/fdbserver/storageserver.actor.cpp @@ -545,9 +545,10 @@ public: int64_t versionLag; // An estimate for how many versions it takes for the data to move from the logs to this storage // server - int64_t versionCount; - double duration; - Optional sourceTLogID; + // Metrics about the latest batch of versions fetched by this StorageServer + int64_t fetchedVersions; // how many versions were fetched + double duration; // how long (in seconds) it took to fetch the versions + Optional sourceTLogID; // the tLog from which the versions were fetched ProtocolVersion logProtocol; @@ -710,6 +711,8 @@ public: specialCounter(cc, "DurableVersion", [self]() { return self->durableVersion.get(); }); specialCounter(cc, "DesiredOldestVersion", [self]() { return self->desiredOldestVersion.get(); }); specialCounter(cc, "VersionLag", [self]() { return self->versionLag; }); + specialCounter(cc, "FetchedVersions", [self]() { return self->fetchedVersions; }); + specialCounter(cc, "Duration", [self]() { return self->duration; }); specialCounter(cc, "LocalRate", [self] { return self->currentRate() * 100; }); specialCounter(cc, "BytesReadSampleCount", [self]() { return self->metrics.bytesReadSample.queue.size(); }); @@ -736,7 +739,7 @@ public: : fetchKeysHistograms(), instanceID(deterministicRandom()->randomUniqueID().first()), storage(this, storage), db(db), actors(false), lastTLogVersion(0), lastVersionWithData(0), restoredVersion(0), rebootAfterDurableVersion(std::numeric_limits::max()), durableInProgress(Void()), versionLag(0), - versionCount(0), duration(0), primaryLocality(tagLocalityInvalid), updateEagerReads(0), shardChangeCounter(0), + fetchedVersions(0), duration(0.0), 
primaryLocality(tagLocalityInvalid), updateEagerReads(0), shardChangeCounter(0), fetchKeysParallelismLock(SERVER_KNOBS->FETCH_KEYS_PARALLELISM_BYTES), shuttingDown(false), debug_inApplyUpdate(false), debug_lastValidateTime(0), watchBytes(0), numWatches(0), logProtocol(0), counters(this), tag(invalidTag), maxQueryQueue(0), thisServerID(ssi.id()), @@ -3527,18 +3530,20 @@ ACTOR Future update(StorageServer* data, bool* pReceivedUpdate) { if (data->otherError.getFuture().isReady()) data->otherError.getFuture().get(); - auto curTime = now(); - data->versionCount = ver - data->version.get(); - data->duration = curTime - data->lastUpdate; - data->sourceTLogID = cursor->getCurrentPeekLocation(); + data->fetchedVersions = ver - data->version.get(); + data->duration = now() - data->lastUpdate; + Optional curSourceTLogID = cursor->getCurrentPeekLocation(); - TraceEvent("StorageServerCatchUpRate", data->thisServerID) - .detail("VersionCount", data->versionCount) - .detail("Duration", data->duration) - .detail("SourceTLogId", data->sourceTLogID.present() ? data->sourceTLogID.get().toString() : "unknown"); + if (curSourceTLogID != data->sourceTLogID) { + data->sourceTLogID = curSourceTLogID; + + TraceEvent("StorageServerSourceTLogID", data->thisServerID) + .detail("SourceTLogID", data->sourceTLogID.present() ? 
data->sourceTLogID.get().toString() : "unknown") + .trackLatest(data->thisServerID.toString() + "/StorageServerSourceTLogID"); + } data->noRecentUpdates.set(false); - data->lastUpdate = curTime; + data->lastUpdate = now(); data->version.set(ver); // Triggers replies to waiting gets for new version(s) setDataVersion(data->thisServerID, data->version.get()); From 061afda2ec108b9343a3306a5b07824ec41eb28f Mon Sep 17 00:00:00 2001 From: Andrew Noyes Date: Wed, 12 May 2021 09:41:26 -0700 Subject: [PATCH 387/461] Fix several package issues (#4801) * Make the structure of the server pkg match 6.2.28 * Fix OSX lib path * Fix install destinations in client Previously, backup_agent would map to fdbmonitor installation dir - which is incorrect in the sense that it disagrees with where the default foundationdb.conf expects to find backup_agent. Add a new backupagent installation dir and install there, matching foundationdb.conf's expectations. Also fix an issue where several of the versions of fdbbackup weren't being installed * Update packaging/osx/buildpkg.sh for cmake * Update README instructions for pkg file * Remove osx cpack config * Remove pm install destinations * Fix weird syntax * Remove cpack reference to PM * Address review comments --- README.md | 4 +- cmake/CPackConfig.cmake | 18 --------- cmake/FDBInstall.cmake | 24 +++++++++--- cmake/InstallLayout.cmake | 46 +++-------------------- fdbbackup/CMakeLists.txt | 4 +- packaging/osx/buildpkg.sh | 48 +++++++++++++++--------- packaging/osx/scripts-server/postinstall | 6 +-- 7 files changed, 61 insertions(+), 89 deletions(-) diff --git a/README.md b/README.md index cd28c798f0..9e0ddb78a5 100755 --- a/README.md +++ b/README.md @@ -157,11 +157,11 @@ The build under MacOS will work the same way as on Linux. To get boost and ninja cmake -G Ninja ``` -To generate a installable package, you can use cpack: +To generate a installable package, ```sh ninja -cpack -G productbuild +$SRCDIR/packaging/osx/buildpkg.sh . 
$SRCDIR ``` ### Windows diff --git a/cmake/CPackConfig.cmake b/cmake/CPackConfig.cmake index 08f90bc0c5..c67059ec65 100644 --- a/cmake/CPackConfig.cmake +++ b/cmake/CPackConfig.cmake @@ -9,24 +9,6 @@ elseif(CPACK_GENERATOR MATCHES "DEB") set(CPACK_COMPONENTS_ALL clients-deb server-deb clients-versioned server-versioned) set(CPACK_RESOURCE_FILE_README ${CMAKE_SOURCE_DIR}/README.md) set(CPACK_RESOURCE_FILE_LICENSE ${CMAKE_SOURCE_DIR}/LICENSE) -elseif(CPACK_GENERATOR MATCHES "productbuild") - set(CPACK_PACKAGING_INSTALL_PREFIX "/") - set(CPACK_COMPONENTS_ALL clients-pm server-pm) - set(CPACK_STRIP_FILES TRUE) - set(CPACK_PREFLIGHT_SERVER_SCRIPT ${CMAKE_SOURCE_DIR}/packaging/osx/scripts-server/preinstall) - set(CPACK_POSTFLIGHT_SERVER_SCRIPT ${CMAKE_SOURCE_DIR}/packaging/osx/scripts-server/postinstall) - set(CPACK_POSTFLIGHT_CLIENTS_SCRIPT ${CMAKE_SOURCE_DIR}/packaging/osx/scripts-server/preinstall) -# Commenting out this readme file until it works within packaging - set(CPACK_RESOURCE_FILE_README ${CMAKE_SOURCE_DIR}/packaging/osx/resources/conclusion.rtf) - set(CPACK_PRODUCTBUILD_RESOURCES_DIR ${CMAKE_SOURCE_DIR}/packaging/osx/resources) -# Changing the path of this file as CMAKE_BINARY_DIR does not seem to be defined - set(CPACK_RESOURCE_FILE_LICENSE ${CMAKE_BINARY_DIR}/License.txt) - if(NOT FDB_RELEASE) - set(prerelease_string "-PRERELEASE") - else() - set(prerelease_string "") - endif() - set(CPACK_PACKAGE_FILE_NAME "FoundationDB-${PROJECT_VERSION}${prerelease_string}") elseif(CPACK_GENERATOR MATCHES "TGZ") set(CPACK_STRIP_FILES TRUE) set(CPACK_COMPONENTS_ALL clients-tgz server-tgz) diff --git a/cmake/FDBInstall.cmake b/cmake/FDBInstall.cmake index 263291c433..2dd4be696f 100644 --- a/cmake/FDBInstall.cmake +++ b/cmake/FDBInstall.cmake @@ -214,7 +214,7 @@ endfunction() function(fdb_install) if(NOT WIN32 AND NOT OPEN_FOR_IDE) - set(one_value_options COMPONENT DESTINATION EXPORT DESTINATION_SUFFIX) + set(one_value_options COMPONENT DESTINATION EXPORT 
DESTINATION_SUFFIX RENAME) set(multi_value_options TARGETS FILES PROGRAMS DIRECTORY) cmake_parse_arguments(IN "${options}" "${one_value_options}" "${multi_value_options}" "${ARGN}") @@ -237,6 +237,9 @@ function(fdb_install) get_install_dest(${pkg} ${destination} install_path) string(TOLOWER "${pkg}" package) if(install_export) + if(IN_RENAME) + message(FATAL_ERROR "RENAME for EXPORT target not implemented") + endif() install( EXPORT "${IN_EXPORT}-${package}" DESTINATION "${install_path}${IN_DESTINATION_SUFFIX}" @@ -248,11 +251,20 @@ function(fdb_install) set(export_args EXPORT "${IN_EXPORT}-${package}") endif() if(NOT ${install_path} STREQUAL "") - install( - ${args} - ${export_args} - DESTINATION "${install_path}${IN_DESTINATION_SUFFIX}" - COMPONENT "${IN_COMPONENT}-${package}") + if(IN_RENAME) + install( + ${args} + ${export_args} + DESTINATION "${install_path}${IN_DESTINATION_SUFFIX}" + COMPONENT "${IN_COMPONENT}-${package}" + RENAME ${IN_RENAME}) + else() + install( + ${args} + ${export_args} + DESTINATION "${install_path}${IN_DESTINATION_SUFFIX}" + COMPONENT "${IN_COMPONENT}-${package}") + endif() endif() endif() endforeach() diff --git a/cmake/InstallLayout.cmake b/cmake/InstallLayout.cmake index a037b65df2..91d39d4125 100644 --- a/cmake/InstallLayout.cmake +++ b/cmake/InstallLayout.cmake @@ -46,10 +46,6 @@ function(install_symlink) TO "../${rel_path}bin/${IN_FILE_NAME}" DESTINATION "usr/lib64/${IN_LINK_NAME}" COMPONENTS "${IN_COMPONENT}-deb") - install_symlink_impl( - TO "../${rel_path}local/bin/${IN_FILE_NAME}" - DESTINATION "usr/lib64/${IN_LINK_NAME}" - COMPONENTS "${IN_COMPONENT}-pm") elseif("${IN_LINK_DIR}" MATCHES "bin") install_symlink_impl( TO "../${rel_path}bin/${IN_FILE_NAME}" @@ -61,10 +57,6 @@ function(install_symlink) COMPONENTS "${IN_COMPONENT}-el6" "${IN_COMPONENT}-el7" "${IN_COMPONENT}-deb") - install_symlink_impl( - TO "../${rel_path}/bin/${IN_FILE_NAME}" - DESTINATION "usr/local/bin/${IN_LINK_NAME}" - COMPONENTS "${IN_COMPONENT}-pm") 
elseif("${IN_LINK_DIR}" MATCHES "fdbmonitor") install_symlink_impl( TO "../../${rel_path}bin/${IN_FILE_NAME}" @@ -76,10 +68,6 @@ function(install_symlink) COMPONENTS "${IN_COMPONENT}-el6" "${IN_COMPONENT}-el7" "${IN_COMPONENT}-deb") - install_symlink_impl( - TO "../../${rel_path}/bin/${IN_FILE_NAME}" - DESTINATION "usr/local/lib/foundationdb/${IN_LINK_NAME}" - COMPONENTS "${IN_COMPONENT}-pm") else() message(FATAL_ERROR "Unknown LINK_DIR ${IN_LINK_DIR}") endif() @@ -103,8 +91,8 @@ function(symlink_files) endif() endfunction() -fdb_install_packages(TGZ DEB EL7 PM VERSIONED) -fdb_install_dirs(BIN SBIN LIB FDBMONITOR INCLUDE ETC LOG DATA) +fdb_install_packages(TGZ DEB EL7 VERSIONED) +fdb_install_dirs(BIN SBIN LIB FDBMONITOR INCLUDE ETC LOG DATA BACKUPAGENT) message(STATUS "FDB_INSTALL_DIRS -> ${FDB_INSTALL_DIRS}") install_destinations(TGZ @@ -112,6 +100,7 @@ install_destinations(TGZ SBIN sbin LIB lib FDBMONITOR sbin + BACKUPAGENT usr/lib/foundationdb INCLUDE include ETC etc/foundationdb LOG log/foundationdb @@ -122,19 +111,13 @@ install_destinations(DEB SBIN usr/sbin LIB usr/lib FDBMONITOR usr/lib/foundationdb + BACKUPAGENT usr/lib/foundationdb INCLUDE usr/include ETC etc/foundationdb LOG var/log/foundationdb DATA var/lib/foundationdb/data) copy_install_destinations(DEB EL7) install_destinations(EL7 LIB usr/lib64) -install_destinations(PM - BIN usr/local/bin - SBIN usr/local/sbin - LIB lib - FDBMONITOR usr/local/libexec - INCLUDE usr/local/include - ETC usr/local/etc/foundationdb) # This can be used for debugging in case above is behaving funky #print_install_destinations() @@ -142,7 +125,7 @@ install_destinations(PM set(generated_dir "${CMAKE_CURRENT_BINARY_DIR}/generated") if(APPLE) - set(CPACK_GENERATOR TGZ productbuild) + set(CPACK_GENERATOR TGZ) else() set(CPACK_GENERATOR RPM DEB TGZ) endif() @@ -212,19 +195,16 @@ set(CPACK_PACKAGE_CONTACT "The FoundationDB Community") set(CPACK_COMPONENT_SERVER-EL7_DEPENDS clients-el7) set(CPACK_COMPONENT_SERVER-DEB_DEPENDS 
clients-deb) set(CPACK_COMPONENT_SERVER-TGZ_DEPENDS clients-tgz) -set(CPACK_COMPONENT_SERVER-PM_DEPENDS clients-pm) set(CPACK_COMPONENT_SERVER-VERSIONED_DEPENDS clients-versioned) set(CPACK_COMPONENT_SERVER-EL7_DISPLAY_NAME "foundationdb-server") set(CPACK_COMPONENT_SERVER-DEB_DISPLAY_NAME "foundationdb-server") set(CPACK_COMPONENT_SERVER-TGZ_DISPLAY_NAME "foundationdb-server") -set(CPACK_COMPONENT_SERVER-PM_DISPLAY_NAME "foundationdb-server") set(CPACK_COMPONENT_SERVER-VERSIONED_DISPLAY_NAME "foundationdb-server-${PROJECT_VERSION}") set(CPACK_COMPONENT_CLIENTS-EL7_DISPLAY_NAME "foundationdb-clients") set(CPACK_COMPONENT_CLIENTS-DEB_DISPLAY_NAME "foundationdb-clients") set(CPACK_COMPONENT_CLIENTS-TGZ_DISPLAY_NAME "foundationdb-clients") -set(CPACK_COMPONENT_CLIENTS-PM_DISPLAY_NAME "foundationdb-clients") set(CPACK_COMPONENT_CLIENTS-VERSIONED_DISPLAY_NAME "foundationdb-clients-${PROJECT_VERSION}") @@ -382,19 +362,6 @@ set(CPACK_DEBIAN_SERVER-VERSIONED_PACKAGE_CONTROL_EXTRA ${CMAKE_BINARY_DIR}/packaging/multiversion/server/postinst ${CMAKE_BINARY_DIR}/packaging/multiversion/server/prerm) -################################################################################ -# MacOS configuration -################################################################################ - -if(APPLE) - install(PROGRAMS ${CMAKE_SOURCE_DIR}/packaging/osx/uninstall-FoundationDB.sh - DESTINATION "usr/local/foundationdb" - COMPONENT clients-pm) - install(FILES ${CMAKE_SOURCE_DIR}/packaging/osx/com.foundationdb.fdbmonitor.plist - DESTINATION "Library/LaunchDaemons" - COMPONENT server-pm) -endif() - ################################################################################ # Configuration for DEB ################################################################################ @@ -413,9 +380,6 @@ set(CLUSTER_DESCRIPTION1 ${description1} CACHE STRING "Cluster description") set(CLUSTER_DESCRIPTION2 ${description2} CACHE STRING "Cluster description") if(NOT WIN32) - install(FILES 
${CMAKE_SOURCE_DIR}/packaging/osx/foundationdb.conf.new - DESTINATION "usr/local/etc" - COMPONENT server-pm) fdb_install(FILES ${CMAKE_SOURCE_DIR}/packaging/foundationdb.conf DESTINATION etc COMPONENT server) diff --git a/fdbbackup/CMakeLists.txt b/fdbbackup/CMakeLists.txt index 48b1ad1aef..da2457b850 100644 --- a/fdbbackup/CMakeLists.txt +++ b/fdbbackup/CMakeLists.txt @@ -23,14 +23,14 @@ target_link_libraries(fdbdecode PRIVATE fdbclient) if(NOT OPEN_FOR_IDE) if(GENERATE_DEBUG_PACKAGES) fdb_install(TARGETS fdbbackup DESTINATION bin COMPONENT clients) - fdb_install(PROGRAMS $ DESTINATION fdbmonitor COMPONENT clients RENAME backup_agent/backup_agent) + fdb_install(PROGRAMS $ DESTINATION backupagent COMPONENT clients RENAME backup_agent/backup_agent) fdb_install(PROGRAMS $ DESTINATION bin COMPONENT clients RENAME fdbrestore) fdb_install(PROGRAMS $ DESTINATION bin COMPONENT clients RENAME dr_agent) fdb_install(PROGRAMS $ DESTINATION bin COMPONENT clients RENAME fdbdr) else() add_custom_target(prepare_fdbbackup_install ALL DEPENDS strip_only_fdbbackup) fdb_install(PROGRAMS ${CMAKE_BINARY_DIR}/packages/bin/fdbbackup DESTINATION bin COMPONENT clients) - fdb_install(PROGRAMS ${CMAKE_BINARY_DIR}/packages/bin/fdbbackup DESTINATION fdbmonitor COMPONENT clients RENAME backup_agent/backup_agent) + fdb_install(PROGRAMS ${CMAKE_BINARY_DIR}/packages/bin/fdbbackup DESTINATION backupagent COMPONENT clients RENAME backup_agent/backup_agent) fdb_install(PROGRAMS ${CMAKE_BINARY_DIR}/packages/bin/fdbbackup DESTINATION bin COMPONENT clients RENAME fdbrestore) fdb_install(PROGRAMS ${CMAKE_BINARY_DIR}/packages/bin/fdbbackup DESTINATION bin COMPONENT clients RENAME dr_agent) fdb_install(PROGRAMS ${CMAKE_BINARY_DIR}/packages/bin/fdbbackup DESTINATION bin COMPONENT clients RENAME fdbdr) diff --git a/packaging/osx/buildpkg.sh b/packaging/osx/buildpkg.sh index 60b441b191..d8b9f560a5 100755 --- a/packaging/osx/buildpkg.sh +++ b/packaging/osx/buildpkg.sh @@ -1,12 +1,26 @@ #!/bin/bash -set -e +set 
-Eeuo pipefail umask 0022 -PKGFILE=$1 -VERSION=$2 -RELEASE=$3 +if [ "$#" -ne 2 ] ; then + echo "Usage: $0 " + exit 1 +fi + +# BUILDDIR is the path to the cmake build directory +# SRCDIR is the path to the source directory +# +# e.g. If your current directory is the project root and the build directory is _build, then you want to do +# $ ./packaging/osx/buildpkg.sh _build . +# +BUILDDIR="$1" +SRCDIR="$2" + +VERSION="$(grep 'CMAKE_PROJECT_VERSION[^_]' "$BUILDDIR/CMakeCache.txt" | sed -e 's/[^=]*=//')" + +PKGFILE="$BUILDDIR/packages/FoundationDB-$VERSION.pkg" CLIENTSDIR=$( mktemp -d -t fdb-clients-pkg ) SERVERDIR=$( mktemp -d -t fdb-server-pkg ) @@ -23,20 +37,20 @@ mkdir -p -m 0755 $CLIENTSDIR/Library/Python/2.7/site-packages/fdb mkdir -p -m 0775 $CLIENTSDIR/usr/local/etc/foundationdb mkdir -p -m 0755 $CLIENTSDIR/usr/local/foundationdb/backup_agent -install -m 0755 bin/fdbcli $CLIENTSDIR/usr/local/bin -install -m 0644 bindings/c/foundationdb/fdb_c.h bindings/c/foundationdb/fdb_c_options.g.h fdbclient/vexillographer/fdb.options $CLIENTSDIR/usr/local/include/foundationdb -install -m 0755 lib/libfdb_c.dylib $CLIENTSDIR/usr/local/lib -install -m 0644 bindings/python/fdb/*.py $CLIENTSDIR/Library/Python/2.7/site-packages/fdb -install -m 0755 bin/fdbbackup $CLIENTSDIR/usr/local/foundationdb/backup_agent/backup_agent -install -m 0755 packaging/osx/uninstall-FoundationDB.sh $CLIENTSDIR/usr/local/foundationdb -dos2unix README.md $CLIENTSDIR/usr/local/foundationdb/README +install -m 0755 "$BUILDDIR"/bin/fdbcli $CLIENTSDIR/usr/local/bin +install -m 0644 "$SRCDIR"/bindings/c/foundationdb/fdb_c.h "$BUILDDIR"/bindings/c/foundationdb/fdb_c_options.g.h "$SRCDIR"/fdbclient/vexillographer/fdb.options $CLIENTSDIR/usr/local/include/foundationdb +install -m 0755 "$BUILDDIR"/lib/libfdb_c.dylib $CLIENTSDIR/usr/local/lib +install -m 0644 "$BUILDDIR"/bindings/python/fdb/*.py $CLIENTSDIR/Library/Python/2.7/site-packages/fdb +install -m 0755 "$BUILDDIR"/bin/fdbbackup 
$CLIENTSDIR/usr/local/foundationdb/backup_agent/backup_agent +install -m 0755 "$SRCDIR"/packaging/osx/uninstall-FoundationDB.sh $CLIENTSDIR/usr/local/foundationdb +dos2unix "$SRCDIR"/README.md $CLIENTSDIR/usr/local/foundationdb/README chmod 0644 $CLIENTSDIR/usr/local/foundationdb/README ln -s /usr/local/foundationdb/backup_agent/backup_agent $CLIENTSDIR/usr/local/bin/fdbbackup ln -s /usr/local/foundationdb/backup_agent/backup_agent $CLIENTSDIR/usr/local/bin/fdbrestore ln -s /usr/local/foundationdb/backup_agent/backup_agent $CLIENTSDIR/usr/local/bin/fdbdr ln -s /usr/local/foundationdb/backup_agent/backup_agent $CLIENTSDIR/usr/local/bin/dr_agent -pkgbuild --root $CLIENTSDIR --identifier FoundationDB-clients --version $VERSION.$RELEASE --scripts packaging/osx/scripts-clients FoundationDB-clients.pkg +pkgbuild --root $CLIENTSDIR --identifier FoundationDB-clients --version $VERSION --scripts "$SRCDIR"/packaging/osx/scripts-clients FoundationDB-clients.pkg rm -rf $CLIENTSDIR @@ -46,14 +60,14 @@ mkdir -p -m 0755 $SERVERDIR/Library/LaunchDaemons mkdir -p -m 0700 $SERVERDIR/usr/local/foundationdb/data mkdir -p -m 0700 $SERVERDIR/usr/local/foundationdb/logs -install -m 0664 packaging/osx/foundationdb.conf.new $SERVERDIR/usr/local/etc/foundationdb -install -m 0755 bin/fdbserver bin/fdbmonitor $SERVERDIR/usr/local/libexec -install -m 0644 packaging/osx/com.foundationdb.fdbmonitor.plist $SERVERDIR/Library/LaunchDaemons +install -m 0664 "$SRCDIR"/packaging/osx/foundationdb.conf.new $SERVERDIR/usr/local/etc/foundationdb +install -m 0755 "$BUILDDIR"/bin/fdbserver "$BUILDDIR"/bin/fdbmonitor $SERVERDIR/usr/local/libexec +install -m 0644 "$SRCDIR"/packaging/osx/com.foundationdb.fdbmonitor.plist $SERVERDIR/Library/LaunchDaemons -pkgbuild --root $SERVERDIR --identifier FoundationDB-server --version $VERSION.$RELEASE --scripts packaging/osx/scripts-server FoundationDB-server.pkg +pkgbuild --root $SERVERDIR --identifier FoundationDB-server --version "$VERSION" --scripts 
"$SRCDIR"/packaging/osx/scripts-server FoundationDB-server.pkg rm -rf $SERVERDIR -productbuild --distribution packaging/osx/Distribution.xml --resources packaging/osx/resources --package-path . $PKGFILE +productbuild --distribution "$SRCDIR"/packaging/osx/Distribution.xml --resources "$SRCDIR"/packaging/osx/resources --package-path . "$PKGFILE" rm FoundationDB-clients.pkg FoundationDB-server.pkg diff --git a/packaging/osx/scripts-server/postinstall b/packaging/osx/scripts-server/postinstall index a31c3fd416..34ce9f7dad 100755 --- a/packaging/osx/scripts-server/postinstall +++ b/packaging/osx/scripts-server/postinstall @@ -1,10 +1,10 @@ #!/bin/bash if [ ! -f /usr/local/etc/foundationdb/fdb.cluster ]; then - description=`LC_CTYPE=C tr -dc A-Za-z0-9 < /dev/urandom | head -c 8` - random_str=`LC_CTYPE=C tr -dc A-Za-z0-9 < /dev/urandom | head -c 8` + description=`LC_CTYPE=C tr -dc '[:lower:][:upper:][:digit:]' < /dev/urandom | head -c 8` + random_str=`LC_CTYPE=C tr -dc '[:lower:][:upper:][:digit:]' < /dev/urandom | head -c 8` echo $description:$random_str@127.0.0.1:4689 > /usr/local/etc/foundationdb/fdb.cluster - chmod 0664 /etc/foundationdb/fdb.cluster + chmod 0664 /usr/local/etc/foundationdb/fdb.cluster NEWDB=1 fi From cc6497ddfb1a023171113b8286c3959bcd176faf Mon Sep 17 00:00:00 2001 From: RenxuanW Date: Wed, 12 May 2021 16:21:44 -0700 Subject: [PATCH 388/461] Only log timeout when CC is unknown for a worker. 
--- fdbserver/Knobs.cpp | 2 +- fdbserver/Knobs.h | 2 +- fdbserver/worker.actor.cpp | 16 ++++++++++------ 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/fdbserver/Knobs.cpp b/fdbserver/Knobs.cpp index aee36e39f1..fc1234d243 100644 --- a/fdbserver/Knobs.cpp +++ b/fdbserver/Knobs.cpp @@ -616,7 +616,7 @@ void ServerKnobs::initialize(bool randomize, ClientKnobs* clientKnobs, bool isSi //Worker init( WORKER_LOGGING_INTERVAL, 5.0 ); init( HEAP_PROFILER_INTERVAL, 30.0 ); - init( REGISTER_WORKER_REQUEST_TIMEOUT, 300.0 ); + init( UNKNOWN_CC_TIMEOUT, 600.0 ); init( DEGRADED_RESET_INTERVAL, 24*60*60 ); if ( randomize && BUGGIFY ) DEGRADED_RESET_INTERVAL = 10; init( DEGRADED_WARNING_LIMIT, 1 ); init( DEGRADED_WARNING_RESET_DELAY, 7*24*60*60 ); diff --git a/fdbserver/Knobs.h b/fdbserver/Knobs.h index 1c7c273a7b..be2caba6a1 100644 --- a/fdbserver/Knobs.h +++ b/fdbserver/Knobs.h @@ -543,7 +543,7 @@ public: // Worker double WORKER_LOGGING_INTERVAL; double HEAP_PROFILER_INTERVAL; - double REGISTER_WORKER_REQUEST_TIMEOUT; + double UNKNOWN_CC_TIMEOUT; double DEGRADED_RESET_INTERVAL; double DEGRADED_WARNING_LIMIT; double DEGRADED_WARNING_RESET_DELAY; diff --git a/fdbserver/worker.actor.cpp b/fdbserver/worker.actor.cpp index 5a568fc96d..5721b154d4 100644 --- a/fdbserver/worker.actor.cpp +++ b/fdbserver/worker.actor.cpp @@ -554,20 +554,24 @@ ACTOR Future registrationClient(Referenceget().present(); state Future registrationReply = - ccInterface->get().present() - ? brokenPromiseToNever(ccInterface->get().get().registerWorker.getReply(request)) - : Never(); + ccInterfacePresent ? 
brokenPromiseToNever(ccInterface->get().get().registerWorker.getReply(request)) + : Never(); state double startTime = now(); loop choose { when(RegisterWorkerReply reply = wait(registrationReply)) { processClass = reply.processClass; asyncPriorityInfo->set(reply.priorityInfo); - TraceEvent("WorkerRegisterReply").detail("CCID", ccInterface->get().get().id()); + TraceEvent("WorkerRegisterReply") + .detail("CCID", ccInterface->get().get().id()) + .detail("ProcessClass", reply.processClass.toString()); break; } - when(wait(delay(SERVER_KNOBS->REGISTER_WORKER_REQUEST_TIMEOUT))) { - TraceEvent(SevWarn, "WorkerRegisterTimeout").detail("WaitTime", now() - startTime); + when(wait(delay(SERVER_KNOBS->UNKNOWN_CC_TIMEOUT))) { + if (!ccInterfacePresent) { + TraceEvent(SevWarn, "WorkerRegisterTimeout").detail("WaitTime", now() - startTime); + } } when(wait(ccInterface->onChange())) { break; } when(wait(ddInterf->onChange())) { break; } From 160293bd5404e9ff2b7003ffe4f66e6b698e2b4b Mon Sep 17 00:00:00 2001 From: Sreenath Bodagala Date: Thu, 13 May 2021 14:28:06 +0000 Subject: [PATCH 389/461] Report bounce impact in fdbcli status Changes: Schemas.cpp: Extend the JSON schema to report whether the cluster is bounceable and if not, report the reason for why it is not bounceable. Status.actor.cpp: Extend recoveryStateStatusFetcher() to populate the bounce related field(s). mr-status-json-schemas.rst.inc: Update the schema to reflect the change made in Schemas.cpp. release-notes-700.rst: Add a note about the new status fields in "Status" section. 
--- .../source/mr-status-json-schemas.rst.inc | 5 +++- .../release-notes/release-notes-700.rst | 2 +- fdbclient/Schemas.cpp | 5 +++- fdbserver/Status.actor.cpp | 23 +++++++++++++------ 4 files changed, 25 insertions(+), 10 deletions(-) diff --git a/documentation/sphinx/source/mr-status-json-schemas.rst.inc b/documentation/sphinx/source/mr-status-json-schemas.rst.inc index acce461308..974244680d 100644 --- a/documentation/sphinx/source/mr-status-json-schemas.rst.inc +++ b/documentation/sphinx/source/mr-status-json-schemas.rst.inc @@ -477,7 +477,6 @@ ], "recovery_state":{ "seconds_since_last_recovered":1, - "seconds_since_fully_recovered":1, "required_resolvers":1, "required_commit_proxies":1, "required_grv_proxies":1, @@ -503,6 +502,10 @@ "required_logs":3, "missing_logs":"7f8d623d0cb9966e", "active_generations":1, + "bounce_impact":{ + "can_clean_bounce":true, + "reason":"" + }, "description":"Recovery complete." }, "workload":{ diff --git a/documentation/sphinx/source/release-notes/release-notes-700.rst b/documentation/sphinx/source/release-notes/release-notes-700.rst index 8e825035a9..c046690b2b 100644 --- a/documentation/sphinx/source/release-notes/release-notes-700.rst +++ b/documentation/sphinx/source/release-notes/release-notes-700.rst @@ -31,7 +31,7 @@ Fixes Status ------ * Added ``commit_batching_window_size`` to the proxy roles section of status to record statistics about commit batching window size on each proxy. `(PR #4735) `_ -* Added ``seconds_since_fully_recovered`` to the recovery_state section of status to report the time, in seconds, since last full recovery. `(PR #4770) `_ +* Added ``bounce_impact`` to the recovery_state section of status to report if the cluster is bounceable and if not, the reason for why it is not bounceable. 
`(PR #4770) `_ Bindings -------- diff --git a/fdbclient/Schemas.cpp b/fdbclient/Schemas.cpp index 1b24af0e77..22f0543a7e 100644 --- a/fdbclient/Schemas.cpp +++ b/fdbclient/Schemas.cpp @@ -521,7 +521,6 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema( R"statusSchema( "recovery_state":{ "seconds_since_last_recovered":1, - "seconds_since_fully_recovered":1, "required_resolvers":1, "required_commit_proxies":1, "required_grv_proxies":1, @@ -547,6 +546,10 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema( "required_logs":3, "missing_logs":"7f8d623d0cb9966e", "active_generations":1, + "bounce_impact":{ + "can_clean_bounce":true, + "reason":"" + }, "description":"Recovery complete." }, "workload":{ diff --git a/fdbserver/Status.actor.cpp b/fdbserver/Status.actor.cpp index 992338310c..eef9dc7d59 100644 --- a/fdbserver/Status.actor.cpp +++ b/fdbserver/Status.actor.cpp @@ -387,6 +387,19 @@ JsonBuilderObject getLagObject(int64_t versions) { return lag; } +static JsonBuilderObject getBounceImpactInfo(int recoveryStatusCode) { + JsonBuilderObject bounceImpact; + + if (recoveryStatusCode == RecoveryStatus::fully_recovered) { + bounceImpact["can_clean_bounce"] = true; + } else { + bounceImpact["can_clean_bounce"] = false; + bounceImpact["reason"] = "cluster hasn't fully recovered yet"; + } + + return bounceImpact; +} + struct MachineMemoryInfo { double memoryUsage; double aggregateLimit; @@ -1168,14 +1181,10 @@ ACTOR static Future recoveryStateStatusFetcher(Database cx, message["required_resolvers"] = requiredResolvers; } else if (mStatusCode == RecoveryStatus::locking_old_transaction_servers) { message["missing_logs"] = md.getValue("MissingIDs").c_str(); - } else if (mStatusCode == RecoveryStatus::fully_recovered) { - if (!rv.isError()) { - int64_t fullyRecoveredAtVersion = md.getInt64("FullyRecoveredAtVersion"); - double secondsSinceFulyRecovered = std::max((int64_t)0, (int64_t)(rv.get() - fullyRecoveredAtVersion)) / - 
(double)SERVER_KNOBS->VERSIONS_PER_SECOND; - message["seconds_since_fully_recovered"] = secondsSinceFulyRecovered; - } } + + message["bounce_impact"] = getBounceImpactInfo(mStatusCode); + // TODO: time_in_recovery: 0.5 // time_in_state: 0.1 From d8cad8efcae1d721ce602563509d81a051ad55c8 Mon Sep 17 00:00:00 2001 From: Sreenath Bodagala Date: Thu, 13 May 2021 16:36:57 +0000 Subject: [PATCH 390/461] Report bounce impact info as part of cluster JSON object. --- fdbclient/Schemas.cpp | 8 ++++---- fdbserver/Status.actor.cpp | 3 +-- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/fdbclient/Schemas.cpp b/fdbclient/Schemas.cpp index 22f0543a7e..0ba2feaaaa 100644 --- a/fdbclient/Schemas.cpp +++ b/fdbclient/Schemas.cpp @@ -546,10 +546,6 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema( "required_logs":3, "missing_logs":"7f8d623d0cb9966e", "active_generations":1, - "bounce_impact":{ - "can_clean_bounce":true, - "reason":"" - }, "description":"Recovery complete." 
}, "workload":{ @@ -652,6 +648,10 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema( "data_distribution_disabled_for_rebalance":true, "data_distribution_disabled":true, "active_primary_dc":"pv", + "bounce_impact":{ + "can_clean_bounce":true, + "reason":"" + }, "configuration":{ "log_anti_quorum":0, "log_replicas":2, diff --git a/fdbserver/Status.actor.cpp b/fdbserver/Status.actor.cpp index eef9dc7d59..12235f9d31 100644 --- a/fdbserver/Status.actor.cpp +++ b/fdbserver/Status.actor.cpp @@ -1183,8 +1183,6 @@ ACTOR static Future recoveryStateStatusFetcher(Database cx, message["missing_logs"] = md.getValue("MissingIDs").c_str(); } - message["bounce_impact"] = getBounceImpactInfo(mStatusCode); - // TODO: time_in_recovery: 0.5 // time_in_state: 0.1 @@ -2791,6 +2789,7 @@ ACTOR Future clusterGetStatus( statusObj["protocol_version"] = format("%" PRIx64, g_network->protocolVersion().version()); statusObj["connection_string"] = coordinators.ccf->getConnectionString().toString(); + statusObj["bounce_impact"] = getBounceImpactInfo(statusCode); state Optional configuration; state Optional loadResult; From 99f6032239aecb5402ef974e076f32de15d43e13 Mon Sep 17 00:00:00 2001 From: Sreenath Bodagala Date: Thu, 13 May 2021 16:47:05 +0000 Subject: [PATCH 391/461] Report bounce impact info as part of cluster JSON object. --- .../sphinx/source/mr-status-json-schemas.rst.inc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/documentation/sphinx/source/mr-status-json-schemas.rst.inc b/documentation/sphinx/source/mr-status-json-schemas.rst.inc index 974244680d..202496620d 100644 --- a/documentation/sphinx/source/mr-status-json-schemas.rst.inc +++ b/documentation/sphinx/source/mr-status-json-schemas.rst.inc @@ -502,10 +502,6 @@ "required_logs":3, "missing_logs":"7f8d623d0cb9966e", "active_generations":1, - "bounce_impact":{ - "can_clean_bounce":true, - "reason":"" - }, "description":"Recovery complete." 
}, "workload":{ @@ -608,6 +604,10 @@ "data_distribution_disabled_for_rebalance":true, "data_distribution_disabled":true, "active_primary_dc":"pv", + "bounce_impact":{ + "can_clean_bounce":true, + "reason":"" + }, "configuration":{ "log_anti_quorum":0, "log_replicas":2, From 8a15d7d14bb198e7f722858607cf2f38dc51ba11 Mon Sep 17 00:00:00 2001 From: RenxuanW Date: Thu, 13 May 2021 12:20:31 -0700 Subject: [PATCH 392/461] Bring #4518 (Logging more detailed information during Tlog recruitment) back. --- fdbserver/ClusterController.actor.cpp | 110 ++++++++++++++++++++++---- 1 file changed, 94 insertions(+), 16 deletions(-) diff --git a/fdbserver/ClusterController.actor.cpp b/fdbserver/ClusterController.actor.cpp index b43e0de27d..107c865221 100644 --- a/fdbserver/ClusterController.actor.cpp +++ b/fdbserver/ClusterController.actor.cpp @@ -458,6 +458,33 @@ public: } } + // Log the reason why the worker is considered as unavailable. + void logWorkerUnavailable(const UID& id, + const std::string& method, + const std::string& reason, + const WorkerDetails& details, + const ProcessClass::Fitness& fitness, + const std::set>& dcIds) { + // Construct the list of DCs where the TLog recruitment is happening. This is mainly for logging purpose. + std::string dcList; + for (const auto& dc : dcIds) { + if (!dcList.empty()) { + dcList += ','; + } + dcList += printable(dc); + } + // Note that the recruitment happens only during initial database creation and recovery. So these trace + // events should be sparse. 
+ TraceEvent("GetTLogTeamWorkerUnavailable", id) + .detail("TLogRecruitMethod", method) + .detail("Reason", reason) + .detail("WorkerID", details.interf.id()) + .detail("WorkerDC", details.interf.locality.dcId()) + .detail("Address", details.interf.addresses().toString()) + .detail("Fitness", fitness) + .detail("RecruitmentDcIds", dcList); + }; + // A TLog recruitment method specialized for three_data_hall and three_datacenter configurations // It attempts to evenly recruit processes from across data_halls or datacenters std::vector getWorkersForTlogsComplex(DatabaseConfiguration const& conf, @@ -478,11 +505,30 @@ public: auto fitness = worker_details.processClass.machineClassFitness(ProcessClass::TLog); if (std::find(exclusionWorkerIds.begin(), exclusionWorkerIds.end(), worker_details.interf.id()) != - exclusionWorkerIds.end() || - !workerAvailable(worker_info, checkStable) || - conf.isExcludedServer(worker_details.interf.addresses()) || fitness == ProcessClass::NeverAssign || - (!dcIds.empty() && dcIds.count(worker_details.interf.locality.dcId()) == 0) || - (!allowDegraded && worker_details.degraded)) { + exclusionWorkerIds.end()) { + logWorkerUnavailable(id, "complex", "Worker is excluded", worker_details, fitness, dcIds); + continue; + } + if (!workerAvailable(worker_info, checkStable)) { + logWorkerUnavailable(id, "complex", "Worker is not available", worker_details, fitness, dcIds); + continue; + } + if (conf.isExcludedServer(worker_details.interf.addresses())) { + logWorkerUnavailable( + id, "complex", "Worker server is excluded from the cluster", worker_details, fitness, dcIds); + continue; + } + if (fitness == ProcessClass::NeverAssign) { + logWorkerUnavailable(id, "complex", "Worker's fitness is NeverAssign", worker_details, fitness, dcIds); + continue; + } + if (!dcIds.empty() && dcIds.count(worker_details.interf.locality.dcId()) == 0) { + logWorkerUnavailable(id, "complex", "Worker is not in the target DC", worker_details, fitness, dcIds); + continue; + 
} + if (!allowDegraded && worker_details.degraded) { + logWorkerUnavailable( + id, "complex", "Worker is degraded and not allowed", worker_details, fitness, dcIds); continue; } @@ -686,10 +732,25 @@ public: const auto& worker_details = worker_info.details; auto fitness = worker_details.processClass.machineClassFitness(ProcessClass::TLog); if (std::find(exclusionWorkerIds.begin(), exclusionWorkerIds.end(), worker_details.interf.id()) != - exclusionWorkerIds.end() || - !workerAvailable(worker_info, checkStable) || - conf.isExcludedServer(worker_details.interf.addresses()) || fitness == ProcessClass::NeverAssign || - (!dcIds.empty() && dcIds.count(worker_details.interf.locality.dcId()) == 0)) { + exclusionWorkerIds.end()) { + logWorkerUnavailable(id, "simple", "Worker is excluded", worker_details, fitness, dcIds); + continue; + } + if (!workerAvailable(worker_info, checkStable)) { + logWorkerUnavailable(id, "simple", "Worker is not available", worker_details, fitness, dcIds); + continue; + } + if (conf.isExcludedServer(worker_details.interf.addresses())) { + logWorkerUnavailable( + id, "simple", "Worker server is excluded from the cluster", worker_details, fitness, dcIds); + continue; + } + if (fitness == ProcessClass::NeverAssign) { + logWorkerUnavailable(id, "simple", "Worker's fitness is NeverAssign", worker_details, fitness, dcIds); + continue; + } + if (!dcIds.empty() && dcIds.count(worker_details.interf.locality.dcId()) == 0) { + logWorkerUnavailable(id, "simple", "Worker is not in the target DC", worker_details, fitness, dcIds); continue; } @@ -795,10 +856,27 @@ public: const auto& worker_details = worker_info.details; auto fitness = worker_details.processClass.machineClassFitness(ProcessClass::TLog); if (std::find(exclusionWorkerIds.begin(), exclusionWorkerIds.end(), worker_details.interf.id()) != - exclusionWorkerIds.end() || - !workerAvailable(worker_info, checkStable) || - conf.isExcludedServer(worker_details.interf.addresses()) || fitness == 
ProcessClass::NeverAssign || - (!dcIds.empty() && dcIds.count(worker_details.interf.locality.dcId()) == 0)) { + exclusionWorkerIds.end()) { + logWorkerUnavailable(id, "deprecated", "Worker is excluded", worker_details, fitness, dcIds); + continue; + } + if (!workerAvailable(worker_info, checkStable)) { + logWorkerUnavailable(id, "deprecated", "Worker is not available", worker_details, fitness, dcIds); + continue; + } + if (conf.isExcludedServer(worker_details.interf.addresses())) { + logWorkerUnavailable( + id, "deprecated", "Worker server is excluded from the cluster", worker_details, fitness, dcIds); + continue; + } + if (fitness == ProcessClass::NeverAssign) { + logWorkerUnavailable( + id, "deprecated", "Worker's fitness is NeverAssign", worker_details, fitness, dcIds); + continue; + } + if (!dcIds.empty() && dcIds.count(worker_details.interf.locality.dcId()) == 0) { + logWorkerUnavailable( + id, "deprecated", "Worker is not in the target DC", worker_details, fitness, dcIds); continue; } @@ -3091,9 +3169,9 @@ ACTOR Future workerAvailabilityWatch(WorkerInterface worker, cluster->masterProcessId = Optional(); } TraceEvent("ClusterControllerWorkerFailed", cluster->id) - .detail("ProcessId", worker.locality.processId()) - .detail("ProcessClass", failedWorkerInfo.details.processClass.toString()) - .detail("Address", worker.address()); + .detail("ProcessId", worker.locality.processId()) + .detail("ProcessClass", failedWorkerInfo.details.processClass.toString()) + .detail("Address", worker.address()); cluster->removedDBInfoEndpoints.insert(worker.updateServerDBInfo.getEndpoint()); cluster->id_worker.erase(worker.locality.processId()); cluster->updateWorkerList.set(worker.locality.processId(), Optional()); From bdb5517f1e281c6ceabe9d0dd61a10ebfa701465 Mon Sep 17 00:00:00 2001 From: Sreenath Bodagala Date: Thu, 13 May 2021 19:38:05 +0000 Subject: [PATCH 393/461] Provide a better explanation of the new metrics in the release notes. 
--- documentation/sphinx/source/release-notes/release-notes-700.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/documentation/sphinx/source/release-notes/release-notes-700.rst b/documentation/sphinx/source/release-notes/release-notes-700.rst index c046690b2b..8997f8c9fd 100644 --- a/documentation/sphinx/source/release-notes/release-notes-700.rst +++ b/documentation/sphinx/source/release-notes/release-notes-700.rst @@ -31,7 +31,7 @@ Fixes Status ------ * Added ``commit_batching_window_size`` to the proxy roles section of status to record statistics about commit batching window size on each proxy. `(PR #4735) `_ -* Added ``bounce_impact`` to the recovery_state section of status to report if the cluster is bounceable and if not, the reason for why it is not bounceable. `(PR #4770) `_ +* Added ``cluster.bounce_impact`` section to status to report if there will be any extra effects when bouncing the cluster, and if so, the reason for those effects. `(PR #4770) `_ Bindings -------- From 6275adc5a063d76238b00ead7fe8baeac7b35aab Mon Sep 17 00:00:00 2001 From: Sreenath Bodagala Date: Thu, 13 May 2021 21:38:07 +0000 Subject: [PATCH 394/461] Address build failure LogSystemPeekCursor.actor.cpp: Check if "interf" is set before referencing it. 
--- fdbserver/LogSystemPeekCursor.actor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbserver/LogSystemPeekCursor.actor.cpp b/fdbserver/LogSystemPeekCursor.actor.cpp index 09e692e0b6..26287919cd 100644 --- a/fdbserver/LogSystemPeekCursor.actor.cpp +++ b/fdbserver/LogSystemPeekCursor.actor.cpp @@ -393,7 +393,7 @@ Version ILogSystem::ServerPeekCursor::getMinKnownCommittedVersion() const { } Optional ILogSystem::ServerPeekCursor::getPrimaryPeekLocation() const { - if (interf->get().present()) { + if (interf && interf->get().present()) { return interf->get().id(); } return Optional(); From 4163270c02b763b739edacbd4504815c02f85ac1 Mon Sep 17 00:00:00 2001 From: Andrew Noyes Date: Thu, 13 May 2021 23:13:14 +0000 Subject: [PATCH 395/461] Put aarch64 libfdb_java in the right place for fat jar --- bindings/java/CMakeLists.txt | 8 +++--- .../main/com/apple/foundationdb/JNIUtil.java | 26 ++++++++++--------- 2 files changed, 19 insertions(+), 15 deletions(-) diff --git a/bindings/java/CMakeLists.txt b/bindings/java/CMakeLists.txt index 2da8639b8d..09012cdf97 100644 --- a/bindings/java/CMakeLists.txt +++ b/bindings/java/CMakeLists.txt @@ -141,8 +141,6 @@ endif() target_include_directories(fdb_java PRIVATE ${JNI_INCLUDE_DIRS}) # libfdb_java.so is loaded by fdb-java.jar and doesn't need to depened on jvm shared libraries. 
target_link_libraries(fdb_java PRIVATE fdb_c) -set_target_properties(fdb_java PROPERTIES - LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lib/${SYSTEM_NAME}/amd64/) if(APPLE) set_target_properties(fdb_java PROPERTIES SUFFIX ".jnilib") endif() @@ -217,7 +215,11 @@ if(NOT OPEN_FOR_IDE) elseif(APPLE) set(lib_destination "osx/x86_64") else() - set(lib_destination "linux/amd64") + if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") + set(lib_destination "linux/aarch64") + else() + set(lib_destination "linux/amd64") + endif() endif() set(lib_destination "${unpack_dir}/lib/${lib_destination}") set(jni_package "${CMAKE_BINARY_DIR}/packages/lib") diff --git a/bindings/java/src/main/com/apple/foundationdb/JNIUtil.java b/bindings/java/src/main/com/apple/foundationdb/JNIUtil.java index 8aa3d9f138..a5380112cd 100644 --- a/bindings/java/src/main/com/apple/foundationdb/JNIUtil.java +++ b/bindings/java/src/main/com/apple/foundationdb/JNIUtil.java @@ -36,11 +36,7 @@ class JNIUtil { private static final String TEMPFILE_PREFIX = "fdbjni"; private static final String TEMPFILE_SUFFIX = ".library"; - private enum OS { - WIN32("windows", "amd64", false), - LINUX("linux", "amd64", true), - OSX("osx", "x86_64", true); - + private static class OS { private final String name; private final String arch; private final boolean canDeleteEager; @@ -171,13 +167,19 @@ class JNIUtil { private static OS getRunningOS() { String osname = System.getProperty("os.name").toLowerCase(); - if(osname.startsWith("windows")) - return OS.WIN32; - if(osname.startsWith("linux")) - return OS.LINUX; - if(osname.startsWith("mac") || osname.startsWith("darwin")) - return OS.OSX; - throw new IllegalStateException("Unknown or unsupported OS: " + osname); + String arch = System.getProperty("os.arch"); + if (arch != "amd64" && arch != "x86_64" && arch != "aarch64") { + throw new IllegalStateException("Unknown or unsupported arch: " + arch); + } + if (osname.startsWith("windows")) { + return new OS("windows", arch, /* 
canDeleteEager */ false); + } else if (osname.startsWith("linux")) { + return new OS("linux", arch, /* canDeleteEager */ true); + } else if (osname.startsWith("mac") || osname.startsWith("darwin")) { + return new OS("osx", arch, /* canDeleteEager */ true); + } else { + throw new IllegalStateException("Unknown or unsupported OS: " + osname); + } } private JNIUtil() {} From e892ca00e4f3e2f78a011d543f452691f2eb7ba7 Mon Sep 17 00:00:00 2001 From: Andrew Noyes Date: Fri, 14 May 2021 00:03:03 +0000 Subject: [PATCH 396/461] Use proper string equality --- bindings/java/src/main/com/apple/foundationdb/JNIUtil.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bindings/java/src/main/com/apple/foundationdb/JNIUtil.java b/bindings/java/src/main/com/apple/foundationdb/JNIUtil.java index a5380112cd..99c2f8a322 100644 --- a/bindings/java/src/main/com/apple/foundationdb/JNIUtil.java +++ b/bindings/java/src/main/com/apple/foundationdb/JNIUtil.java @@ -168,7 +168,7 @@ class JNIUtil { private static OS getRunningOS() { String osname = System.getProperty("os.name").toLowerCase(); String arch = System.getProperty("os.arch"); - if (arch != "amd64" && arch != "x86_64" && arch != "aarch64") { + if (!arch.equals("amd64") && !arch.equals("x86_64") && !arch.equals("aarch64")) { throw new IllegalStateException("Unknown or unsupported arch: " + arch); } if (osname.startsWith("windows")) { From 70e53605cfe83ae798ccf4027e27d36b0077ad67 Mon Sep 17 00:00:00 2001 From: Alex Moundalexis Date: Fri, 14 May 2021 10:50:46 -0400 Subject: [PATCH 397/461] updated copyright year on web site --- documentation/sphinx/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/documentation/sphinx/conf.py b/documentation/sphinx/conf.py index 5ec9238930..ab42fdba6a 100644 --- a/documentation/sphinx/conf.py +++ b/documentation/sphinx/conf.py @@ -49,7 +49,7 @@ master_doc = 'index' # General information about the project. 
project = u'FoundationDB' -copyright = u'2013-2018 Apple, Inc and the FoundationDB project authors' +copyright = u'2013-2021 Apple, Inc and the FoundationDB project authors' # Load the version information from 'versions.target' import xml.etree.ElementTree as ET From a162682d6d63a435da4e283c28c3762e40f184ad Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Fri, 14 May 2021 11:12:47 -0700 Subject: [PATCH 398/461] Fix accounting for time spent in run loop after breaking due to yield --- flow/Net2.actor.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/flow/Net2.actor.cpp b/flow/Net2.actor.cpp index bb0b0325c6..c3b35f1203 100644 --- a/flow/Net2.actor.cpp +++ b/flow/Net2.actor.cpp @@ -1513,6 +1513,7 @@ void Net2::run() { double newTaskBegin = timer_monotonic(); if (check_yield(TaskPriority::Max, tscNow)) { checkForSlowTask(tscBegin, tscNow, newTaskBegin - taskBegin, currentTaskID); + taskBegin = newTaskBegin; FDB_TRACE_PROBE(run_loop_yield); ++countYields; break; From d55b94fc06ca521bfc8b35d91a9f0f0ad226f5a8 Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Fri, 14 May 2021 12:38:26 -0700 Subject: [PATCH 399/461] Add release note --- documentation/sphinx/source/release-notes/release-notes-630.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/documentation/sphinx/source/release-notes/release-notes-630.rst b/documentation/sphinx/source/release-notes/release-notes-630.rst index f4b5c8aacb..2057e7fcb2 100644 --- a/documentation/sphinx/source/release-notes/release-notes-630.rst +++ b/documentation/sphinx/source/release-notes/release-notes-630.rst @@ -2,9 +2,11 @@ Release Notes ############# + 6.3.13 ====== * The multi-version client now requires at most two client connections with version 6.2 or larger, regardless of how many external clients are configured. Clients older than 6.2 will continue to create an additional connection each. `(PR #4667) `_ +* Fix an accounting error that could potentially result in inaccuracies in priority busyness metrics. 
`(PR #4824) `_ 6.3.12 ====== From 2298567c2bf998915123401e004680ee6726b099 Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Fri, 14 May 2021 23:12:00 -0700 Subject: [PATCH 400/461] Use of aligned_alloc() for 4k pages causes too much wasted virtual memory. Added new 4k-aligned fast allocator, and changed Arena::allocatedAlignedBuffer() to be 4k-specific, now called Arena::allocate4kAlignedBuffer(). --- fdbrpc/dsltest.actor.cpp | 4 ++-- fdbserver/IPager.h | 5 ++++- flow/Arena.cpp | 36 ++++++++++++++++++------------------ flow/Arena.h | 14 +++++++------- flow/FastAlloc.h | 20 ++++++++++++++++++++ 5 files changed, 51 insertions(+), 28 deletions(-) diff --git a/fdbrpc/dsltest.actor.cpp b/fdbrpc/dsltest.actor.cpp index 21e1808afd..89703ea2d2 100644 --- a/fdbrpc/dsltest.actor.cpp +++ b/fdbrpc/dsltest.actor.cpp @@ -632,8 +632,8 @@ void showArena(ArenaBlock* a, ArenaBlock* parent) { ArenaBlockRef* r = (ArenaBlockRef*)((char*)a->getData() + o); // If alignedBuffer is valid then print its pointer and size, else recurse - if (r->alignedBufferSize != 0) { - printf("AlignedBuffer %p (<-%p) %u bytes\n", r->alignedBuffer, a, r->alignedBufferSize); + if (r->aligned4kBufferSize != 0) { + printf("AlignedBuffer %p (<-%p) %u bytes\n", r->aligned4kBuffer, a, r->aligned4kBufferSize); } else { showArena(r->next, a); } diff --git a/fdbserver/IPager.h b/fdbserver/IPager.h index cb9612fd95..79d7ed2a80 100644 --- a/fdbserver/IPager.h +++ b/fdbserver/IPager.h @@ -43,7 +43,7 @@ public: // The page's logical size includes an opaque checksum, use size() to get usable size ArenaPage(int logicalSize, int bufferSize) : logicalSize(logicalSize), bufferSize(bufferSize), userData(nullptr) { if (bufferSize > 0) { - buffer = (uint8_t*)arena.allocateAlignedBuffer(4096, bufferSize); + buffer = (uint8_t*)arena.allocate4kAlignedBuffer(bufferSize); // Mark any unused page portion defined VALGRIND_MAKE_MEM_DEFINED(buffer + logicalSize, bufferSize - logicalSize); @@ -56,6 +56,9 @@ public: if (userData != 
nullptr && userDataDestructor != nullptr) { userDataDestructor(userData); } + if(buffer != 0) { + VALGRIND_MAKE_MEM_UNDEFINED(buffer, bufferSize); + } } uint8_t const* begin() const { return (uint8_t*)buffer; } diff --git a/flow/Arena.cpp b/flow/Arena.cpp index fe649c548b..016112cc5b 100644 --- a/flow/Arena.cpp +++ b/flow/Arena.cpp @@ -102,8 +102,8 @@ void Arena::dependsOn(const Arena& p) { } } -void* Arena::allocateAlignedBuffer(size_t alignment, size_t size) { - return ArenaBlock::dependOnAlignedBuffer(impl, alignment, size); +void* Arena::allocate4kAlignedBuffer(size_t size) { + return ArenaBlock::dependOn4kAlignedBuffer(impl, size); } size_t Arena::getSize() const { @@ -177,8 +177,8 @@ size_t ArenaBlock::totalSize() { while (o) { ArenaBlockRef* r = (ArenaBlockRef*)((char*)getData() + o); makeDefined(r, sizeof(ArenaBlockRef)); - if (r->alignedBufferSize != 0) { - s += r->alignedBufferSize; + if (r->aligned4kBufferSize != 0) { + s += r->aligned4kBufferSize; } else { allowAccess(r->next); s += r->next->totalSize(); @@ -201,7 +201,7 @@ void ArenaBlock::getUniqueBlocks(std::set& a) { makeDefined(r, sizeof(ArenaBlockRef)); // If next is valid recursively count its blocks - if (r->alignedBufferSize == 0) { + if (r->aligned4kBufferSize == 0) { r->next->getUniqueBlocks(a); } @@ -226,7 +226,7 @@ int ArenaBlock::addUsed(int bytes) { void ArenaBlock::makeReference(ArenaBlock* next) { ArenaBlockRef* r = (ArenaBlockRef*)((char*)getData() + bigUsed); makeDefined(r, sizeof(ArenaBlockRef)); - r->alignedBufferSize = 0; + r->aligned4kBufferSize = 0; r->next = next; r->nextBlockOffset = nextBlockOffset; makeNoAccess(r, sizeof(ArenaBlockRef)); @@ -234,17 +234,17 @@ void ArenaBlock::makeReference(ArenaBlock* next) { bigUsed += sizeof(ArenaBlockRef); } -void* ArenaBlock::makeAlignedBuffer(size_t alignment, size_t size) { +void* ArenaBlock::make4kAlignedBuffer(size_t size) { ArenaBlockRef* r = (ArenaBlockRef*)((char*)getData() + bigUsed); makeDefined(r, sizeof(ArenaBlockRef)); - 
r->alignedBufferSize = size; - r->alignedBuffer = aligned_alloc(alignment, size); - // printf("Arena::alignedBuffer alloc %p\n", r->alignedBuffer); + r->aligned4kBufferSize = size; + r->aligned4kBuffer = allocateFast4kAligned(size); + //printf("Arena::aligned4kBuffer alloc size=%u ptr=%p\n", size, r->aligned4kBuffer); r->nextBlockOffset = nextBlockOffset; makeNoAccess(r, sizeof(ArenaBlockRef)); nextBlockOffset = bigUsed; bigUsed += sizeof(ArenaBlockRef); - return r->alignedBuffer; + return r->aligned4kBuffer; } void ArenaBlock::dependOn(Reference& self, ArenaBlock* other) { @@ -255,11 +255,11 @@ void ArenaBlock::dependOn(Reference& self, ArenaBlock* other) { self->makeReference(other); } -void* ArenaBlock::dependOnAlignedBuffer(Reference& self, size_t alignment, size_t size) { +void* ArenaBlock::dependOn4kAlignedBuffer(Reference& self, size_t size) { if (!self || self->isTiny() || self->unused() < sizeof(ArenaBlockRef)) { - return create(SMALL, self)->makeAlignedBuffer(alignment, size); + return create(SMALL, self)->make4kAlignedBuffer(size); } else { - return self->makeAlignedBuffer(alignment, size); + return self->make4kAlignedBuffer(size); } } @@ -396,10 +396,10 @@ void ArenaBlock::destroy() { ArenaBlockRef* br = (ArenaBlockRef*)((char*)b->getData() + o); makeDefined(br, sizeof(ArenaBlockRef)); - // If alignedBuffer is valid, free it - if (br->alignedBufferSize != 0) { - // printf("Arena::alignedBuffer free %p\n", br->alignedBuffer); - aligned_free(br->alignedBuffer); + // If aligned4kBuffer is valid, free it + if (br->aligned4kBufferSize != 0) { + //printf("Arena::aligned4kBuffer free %p\n", br->aligned4kBuffer); + freeFast4kAligned(br->aligned4kBufferSize, br->aligned4kBuffer); } else { allowAccess(br->next); if (br->next->delref_no_destroy()) diff --git a/flow/Arena.h b/flow/Arena.h index 999e873044..a34dcf67c6 100644 --- a/flow/Arena.h +++ b/flow/Arena.h @@ -102,7 +102,7 @@ public: Arena& operator=(Arena&&) noexcept; void dependsOn(const Arena& p); - void* 
allocateAlignedBuffer(size_t alignment, size_t size); + void* allocate4kAlignedBuffer(size_t size); size_t getSize() const; bool hasFree(size_t size, const void* address); @@ -130,12 +130,12 @@ struct scalar_traits : std::true_type { }; struct ArenaBlockRef { - // If alignedBufferSize is not 0, alignedBuffer is valid and must be freed with aligned_free() - // Otherwise, next is valid - size_t alignedBufferSize; + // Only one of (next, aligned4kBuffer) are valid at any one time, as they occupy the same space. + // If aligned4kBufferSize is not 0, aligned4kBuffer is valid, otherwise next is valid. + size_t aligned4kBufferSize; union { ArenaBlock* next; - void* alignedBuffer; + void* aligned4kBuffer; }; uint32_t nextBlockOffset; }; @@ -167,9 +167,9 @@ struct ArenaBlock : NonCopyable, ThreadSafeReferenceCounted { void getUniqueBlocks(std::set& a); int addUsed(int bytes); void makeReference(ArenaBlock* next); - void* makeAlignedBuffer(size_t alignment, size_t size); + void* make4kAlignedBuffer(size_t size); static void dependOn(Reference& self, ArenaBlock* other); - static void* dependOnAlignedBuffer(Reference& self, size_t alignment, size_t size); + static void* dependOn4kAlignedBuffer(Reference& self, size_t size); static void* allocate(Reference& self, int bytes); // Return an appropriately-sized ArenaBlock to store the given data static ArenaBlock* create(int dataSize, Reference& next); diff --git a/flow/FastAlloc.h b/flow/FastAlloc.h index f9fff408d2..3f5f2ab58b 100644 --- a/flow/FastAlloc.h +++ b/flow/FastAlloc.h @@ -266,4 +266,24 @@ inline void freeFast(int size, void* ptr) { delete[](uint8_t*) ptr; } +[[nodiscard]] inline void* allocateFast4kAligned(int size) { + if (size <= 4096) + return FastAllocator<4096>::allocate(); + if (size <= 8192) + return FastAllocator<8192>::allocate(); + if (size <= 16384) + return FastAllocator<16384>::allocate(); + return aligned_alloc(4096, size); +} + +inline void freeFast4kAligned(int size, void* ptr) { + if (size <= 4096) + 
return FastAllocator<4096>::release(ptr); + if (size <= 8192) + return FastAllocator<8192>::release(ptr); + if (size <= 16384) + return FastAllocator<16384>::release(ptr); + aligned_free(ptr); +} + #endif From 6a5bf120f83c9efa73b91ebaa54ab6764e53bb41 Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Sat, 15 May 2021 23:00:21 -0700 Subject: [PATCH 401/461] Bug fix: It is possible for the pager to be shut down while a cursor operation is acquiring its mutex, specifically after the permit is available but before the delay(0) inside take() is ready, causing the cursor to operate on an invalid pager. --- fdbserver/VersionedBTree.actor.cpp | 9 +++++++++ flow/genericactors.actor.h | 8 ++++++++ 2 files changed, 17 insertions(+) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 1c97ab11bd..66e6bed9b7 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -264,6 +264,7 @@ public: Future writeOperations; FlowLock mutex; + Future killMutex; Cursor() : mode(NONE) {} @@ -274,6 +275,14 @@ public: int readOffset = 0, LogicalPageID endPage = invalidLogicalPageID) { queue = q; + + // If the pager gets an error, which includes shutdown, kill the mutex so any waiters can no longer run. + // This avoids having every mutex wait also wait on pagerError. 
+ killMutex = map(ready(queue->pagerError), [=](Void e) { + mutex.kill(); + return Void(); + }); + mode = m; firstPageIDWritten = invalidLogicalPageID; offset = readOffset; diff --git a/flow/genericactors.actor.h b/flow/genericactors.actor.h index 46cdb6d73f..7bf2a05e63 100644 --- a/flow/genericactors.actor.h +++ b/flow/genericactors.actor.h @@ -1334,6 +1334,14 @@ struct FlowLock : NonCopyable, public ReferenceCounted { int64_t activePermits() const { return active; } int waiters() const { return takers.size(); } + // Try to send error to all current and future waiters + // Only works if broken_on_destruct.canBeSet() + void kill(Error e = broken_promise()) { + if (broken_on_destruct.canBeSet()) { + broken_on_destruct.sendError(e); + } + } + private: std::list, int64_t>> takers; const int64_t permits; From cfeff9aa4bf8a2cc43845218e8bc33e00cc128ab Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Sun, 16 May 2021 01:41:40 -0700 Subject: [PATCH 402/461] Clarity improvement, loop was reusing variable name from enclosing scope. 
--- fdbserver/IPager.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fdbserver/IPager.h b/fdbserver/IPager.h index 79d7ed2a80..811ed8ab77 100644 --- a/fdbserver/IPager.h +++ b/fdbserver/IPager.h @@ -93,16 +93,16 @@ public: int usableSize = pages.front()->size(); int totalUsableSize = pages.size() * usableSize; int totalBufferSize = pages.front()->bufferSize * pages.size(); - ArenaPage* p = new ArenaPage(totalUsableSize + sizeof(Checksum), totalBufferSize); + ArenaPage* superpage = new ArenaPage(totalUsableSize + sizeof(Checksum), totalBufferSize); - uint8_t* wptr = p->mutate(); + uint8_t* wptr = superpage->mutate(); for (auto& p : pages) { ASSERT(p->size() == usableSize); memcpy(wptr, p->begin(), usableSize); wptr += usableSize; } - return Reference(p); + return Reference(superpage); } Checksum& getChecksum() { return *(Checksum*)(buffer + size()); } From b4e766bd13d8b44eb4be53b3ed96587db8af4e99 Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Sun, 16 May 2021 02:00:43 -0700 Subject: [PATCH 403/461] Bug fix, returned value wasn't pointing into the correct arena. 
--- fdbserver/VersionedBTree.actor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 66e6bed9b7..abfacef7a7 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -6316,7 +6316,7 @@ public: wait(cur.seekGTE(key, 0)); if (cur.isValid() && cur.get().key == key) { - Value v = cur.get().value.get(); + ValueRef v = cur.get().value.get(); int len = std::min(v.size(), maxLength); // Return a Value prefix whose arena is the source page's arena return Value(v.substr(0, len), cur.back().page->getArena()); From bd0c4a4892398cb7629c7c8b19467321ae6c56ec Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Sun, 16 May 2021 03:03:55 -0700 Subject: [PATCH 404/461] Avoid callers of getValue() and getValuePrefix() from being able to add arena dependencies to the source page arena. --- fdbserver/VersionedBTree.actor.cpp | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index abfacef7a7..1e7bb4e712 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -6292,7 +6292,10 @@ public: wait(cur.seekGTE(key, 0)); if (cur.isValid() && cur.get().key == key) { // Return a Value whose arena is the source page's arena - return Value(cur.get().value.get(), cur.back().page->getArena()); + Value v; + v.arena().dependsOn(cur.back().page->getArena()); + v.contents() = cur.get().value.get(); + return v; } return Optional(); @@ -6316,10 +6319,14 @@ public: wait(cur.seekGTE(key, 0)); if (cur.isValid() && cur.get().key == key) { - ValueRef v = cur.get().value.get(); - int len = std::min(v.size(), maxLength); - // Return a Value prefix whose arena is the source page's arena - return Value(v.substr(0, len), cur.back().page->getArena()); + // Return a Value whose arena is the source page's arena + Value v; + 
v.arena().dependsOn(cur.back().page->getArena()); + v.contents() = cur.get().value.get(); + if (v.size() > maxLength) { + v.contents() = v.substr(0, maxLength); + } + return v; } return Optional(); From a31e4f622f928dd6959ef40171f67d0abc2296ac Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Sun, 16 May 2021 03:58:05 -0700 Subject: [PATCH 405/461] Changed ArenaBlockRef to use 32 bit aligned4kBuffer size. --- flow/Arena.cpp | 10 +++++----- flow/Arena.h | 8 ++++---- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/flow/Arena.cpp b/flow/Arena.cpp index 016112cc5b..096ded32fd 100644 --- a/flow/Arena.cpp +++ b/flow/Arena.cpp @@ -102,7 +102,7 @@ void Arena::dependsOn(const Arena& p) { } } -void* Arena::allocate4kAlignedBuffer(size_t size) { +void* Arena::allocate4kAlignedBuffer(uint32_t size) { return ArenaBlock::dependOn4kAlignedBuffer(impl, size); } @@ -234,12 +234,12 @@ void ArenaBlock::makeReference(ArenaBlock* next) { bigUsed += sizeof(ArenaBlockRef); } -void* ArenaBlock::make4kAlignedBuffer(size_t size) { +void* ArenaBlock::make4kAlignedBuffer(uint32_t size) { ArenaBlockRef* r = (ArenaBlockRef*)((char*)getData() + bigUsed); makeDefined(r, sizeof(ArenaBlockRef)); r->aligned4kBufferSize = size; r->aligned4kBuffer = allocateFast4kAligned(size); - //printf("Arena::aligned4kBuffer alloc size=%u ptr=%p\n", size, r->aligned4kBuffer); + // printf("Arena::aligned4kBuffer alloc size=%u ptr=%p\n", size, r->aligned4kBuffer); r->nextBlockOffset = nextBlockOffset; makeNoAccess(r, sizeof(ArenaBlockRef)); nextBlockOffset = bigUsed; @@ -255,7 +255,7 @@ void ArenaBlock::dependOn(Reference& self, ArenaBlock* other) { self->makeReference(other); } -void* ArenaBlock::dependOn4kAlignedBuffer(Reference& self, size_t size) { +void* ArenaBlock::dependOn4kAlignedBuffer(Reference& self, uint32_t size) { if (!self || self->isTiny() || self->unused() < sizeof(ArenaBlockRef)) { return create(SMALL, self)->make4kAlignedBuffer(size); } else { @@ -398,7 +398,7 @@ void 
ArenaBlock::destroy() { // If aligned4kBuffer is valid, free it if (br->aligned4kBufferSize != 0) { - //printf("Arena::aligned4kBuffer free %p\n", br->aligned4kBuffer); + // printf("Arena::aligned4kBuffer free %p\n", br->aligned4kBuffer); freeFast4kAligned(br->aligned4kBufferSize, br->aligned4kBuffer); } else { allowAccess(br->next); diff --git a/flow/Arena.h b/flow/Arena.h index a34dcf67c6..c08072e35c 100644 --- a/flow/Arena.h +++ b/flow/Arena.h @@ -102,7 +102,7 @@ public: Arena& operator=(Arena&&) noexcept; void dependsOn(const Arena& p); - void* allocate4kAlignedBuffer(size_t size); + void* allocate4kAlignedBuffer(uint32_t size); size_t getSize() const; bool hasFree(size_t size, const void* address); @@ -132,7 +132,7 @@ struct scalar_traits : std::true_type { struct ArenaBlockRef { // Only one of (next, aligned4kBuffer) are valid at any one time, as they occupy the same space. // If aligned4kBufferSize is not 0, aligned4kBuffer is valid, otherwise next is valid. - size_t aligned4kBufferSize; + uint32_t aligned4kBufferSize; union { ArenaBlock* next; void* aligned4kBuffer; @@ -167,9 +167,9 @@ struct ArenaBlock : NonCopyable, ThreadSafeReferenceCounted { void getUniqueBlocks(std::set& a); int addUsed(int bytes); void makeReference(ArenaBlock* next); - void* make4kAlignedBuffer(size_t size); + void* make4kAlignedBuffer(uint32_t size); static void dependOn(Reference& self, ArenaBlock* other); - static void* dependOn4kAlignedBuffer(Reference& self, size_t size); + static void* dependOn4kAlignedBuffer(Reference& self, uint32_t size); static void* allocate(Reference& self, int bytes); // Return an appropriately-sized ArenaBlock to store the given data static ArenaBlock* create(int dataSize, Reference& next); From f88596bfd0d207c3f953b9eb09effcf3251865de Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Sun, 16 May 2021 05:13:55 -0700 Subject: [PATCH 406/461] Applied clang-format after conflict resolution. 
--- fdbserver/VersionedBTree.actor.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index ba3dd2c86b..7e29d34147 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -4279,8 +4279,7 @@ private: const RedwoodRecordRef* upperBound, bool forLazyClear = false, bool cacheable = true, - bool* fromCache = nullptr) - { + bool* fromCache = nullptr) { if (!forLazyClear) { debug_printf("readPage() op=read %s @%" PRId64 " lower=%s upper=%s\n", toString(id).c_str(), From 8b1f9f733749ffc347543152fe101d47c2427701 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Tue, 11 May 2021 00:05:08 +0000 Subject: [PATCH 407/461] Add command line support --- .gitignore | 2 +- fdbcli/fdbcli.actor.cpp | 11 ++++++++--- fdbclient/DatabaseConfiguration.cpp | 11 +++++++---- fdbclient/DatabaseConfiguration.h | 3 +++ fdbclient/ManagementAPI.actor.cpp | 8 ++++++++ 5 files changed, 27 insertions(+), 8 deletions(-) diff --git a/.gitignore b/.gitignore index 2b74cc1f7c..f555965fab 100644 --- a/.gitignore +++ b/.gitignore @@ -7,7 +7,7 @@ bindings/java/foundationdb-client*.jar bindings/java/foundationdb-tests*.jar bindings/java/fdb-java-*-sources.jar packaging/msi/FDBInstaller.msi - +builds/ # Generated source, build, and packaging files *.g.cpp *.g.h diff --git a/fdbcli/fdbcli.actor.cpp b/fdbcli/fdbcli.actor.cpp index d10da845ec..7f1bb3b735 100644 --- a/fdbcli/fdbcli.actor.cpp +++ b/fdbcli/fdbcli.actor.cpp @@ -496,7 +496,8 @@ void initHelp() { helpMap["configure"] = CommandHelp( "configure [new] " "|" - "commit_proxies=|grv_proxies=|logs=|resolvers=>*", + "commit_proxies=|grv_proxies=|logs=|resolvers=>*|" + "perpetual_storage_wiggle=", "change the database configuration", "The `new' option, if present, initializes a new database with the given configuration rather than changing " "the configuration of an existing one. 
When used, both a redundancy mode and a storage engine must be " @@ -517,8 +518,11 @@ void initHelp() { "1, or set to -1 which restores the number of GRV proxies to the default value.\n\nlogs=: Sets the " "desired number of log servers in the cluster. Must be at least 1, or set to -1 which restores the number of " "logs to the default value.\n\nresolvers=: Sets the desired number of resolvers in the cluster. " - "Must be at least 1, or set to -1 which restores the number of resolvers to the default value.\n\nSee the " - "FoundationDB Administration Guide for more information."); + "Must be at least 1, or set to -1 which restores the number of resolvers to the default value.\n\n" + "perpetual_storage_wiggle=: Set the value speed (a.k.a., the number of processes that the Data " + "Distributor should wiggle at a time). Currently, only 0 and 1 are supported. The value 0 means to disable the " + "perpetual storage wiggle.\n\n" + "See the FoundationDB Administration Guide for more information."); helpMap["fileconfigure"] = CommandHelp( "fileconfigure [new] ", "change the database configuration from a file", @@ -2766,6 +2770,7 @@ void configureGenerator(const char* text, const char* line, std::vectorinfo() != "dcid^2 x zoneid^2 x 1") && + // We cannot specify regions with three_datacenter replication + (perpetualStorageWiggleSpeed == 0 || perpetualStorageWiggleSpeed == 1))) { return false; } std::set dcIds; @@ -352,7 +353,7 @@ StatusObject DatabaseConfiguration::toJSON(bool noPolicies) const { } result["backup_worker_enabled"] = (int32_t)backupWorkerEnabled; - + result["perpetual_storage_wiggle"] = perpetualStorageWiggleSpeed; return result; } @@ -499,6 +500,8 @@ bool DatabaseConfiguration::setInternal(KeyRef key, ValueRef value) { parse(&repopulateRegionAntiQuorum, value); } else if (ck == LiteralStringRef("regions")) { parse(®ions, value); + } else if (ck == LiteralStringRef("perpetual_storage_wiggle")) { + parse(&perpetualStorageWiggleSpeed, value); } else { return 
false; } diff --git a/fdbclient/DatabaseConfiguration.h b/fdbclient/DatabaseConfiguration.h index bc64a6c9c5..ef539f40b0 100644 --- a/fdbclient/DatabaseConfiguration.h +++ b/fdbclient/DatabaseConfiguration.h @@ -239,6 +239,9 @@ struct DatabaseConfiguration { int32_t repopulateRegionAntiQuorum; std::vector regions; + // Perpetual Storage Setting + int32_t perpetualStorageWiggleSpeed; + // Excluded servers (no state should be here) bool isExcludedServer(NetworkAddressList) const; std::set getExcludedServers() const; diff --git a/fdbclient/ManagementAPI.actor.cpp b/fdbclient/ManagementAPI.actor.cpp index 90d670e801..f53cf65828 100644 --- a/fdbclient/ManagementAPI.actor.cpp +++ b/fdbclient/ManagementAPI.actor.cpp @@ -134,6 +134,14 @@ std::map configForToken(std::string const& mode) { BinaryWriter::toValue(regionObj, IncludeVersion(ProtocolVersion::withRegionConfiguration())).toString(); } + if (key == "perpetual_storage_wiggle" && isInteger(value)) { + int ppWiggle = atoi(value.c_str()); + if (ppWiggle >= 2 || ppWiggle < 0) { + printf("Error: Only 0 and 1 are valid values of perpetual_storage_wiggle at present.\n"); + return out; + } + out[p + key] = value; + } return out; } From 6065d247f86e12988cdab05a1adc910a8abc7ea4 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Mon, 17 May 2021 20:22:27 +0000 Subject: [PATCH 408/461] fix perpetualStorageWiggleKey --- fdbclient/SystemData.cpp | 3 +++ fdbclient/SystemData.h | 2 ++ 2 files changed, 5 insertions(+) diff --git a/fdbclient/SystemData.cpp b/fdbclient/SystemData.cpp index 314df8930b..fd8d2faa9f 100644 --- a/fdbclient/SystemData.cpp +++ b/fdbclient/SystemData.cpp @@ -594,6 +594,9 @@ ProcessClass decodeProcessClassValue(ValueRef const& value) { const KeyRangeRef configKeys(LiteralStringRef("\xff/conf/"), LiteralStringRef("\xff/conf0")); const KeyRef configKeysPrefix = configKeys.begin; +const KeyRef perpetualStorageWiggleKey(LiteralStringRef("\xff/conf/perpetual_storage_wiggle")); +const KeyRef 
wigglingStorageServerKey(LiteralStringRef("\xff/storageWiggleUID")); + const KeyRef triggerDDTeamInfoPrintKey(LiteralStringRef("\xff/triggerDDTeamInfoPrint")); const KeyRangeRef excludedServersKeys(LiteralStringRef("\xff/conf/excluded/"), LiteralStringRef("\xff/conf/excluded0")); diff --git a/fdbclient/SystemData.h b/fdbclient/SystemData.h index e7a54e632c..79efb688c8 100644 --- a/fdbclient/SystemData.h +++ b/fdbclient/SystemData.h @@ -196,6 +196,8 @@ UID decodeProcessClassKeyOld(KeyRef const& key); extern const KeyRangeRef configKeys; extern const KeyRef configKeysPrefix; +extern const KeyRef perpetualStorageWiggleKey; +extern const KeyRef wigglingStorageServerKey; // Change the value of this key to anything and that will trigger detailed data distribution team info log. extern const KeyRef triggerDDTeamInfoPrintKey; From e40538729e29882c93396e15c5f3fc796cff2ddf Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Mon, 17 May 2021 14:46:06 -0700 Subject: [PATCH 409/461] Update fdbserver/IPager.h Co-authored-by: Andrew Noyes --- fdbserver/IPager.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbserver/IPager.h b/fdbserver/IPager.h index 55146b6533..31ff36ef88 100644 --- a/fdbserver/IPager.h +++ b/fdbserver/IPager.h @@ -56,7 +56,7 @@ public: if (userData != nullptr && userDataDestructor != nullptr) { userDataDestructor(userData); } - if(buffer != 0) { + if(buffer != nullptr) { VALGRIND_MAKE_MEM_UNDEFINED(buffer, bufferSize); } } From 60504e12ac7e4e4e8f9624402496a98015e9b4ec Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Mon, 17 May 2021 18:02:09 -0700 Subject: [PATCH 410/461] Address review comments. 
--- fdbserver/VersionedBTree.actor.cpp | 33 +++++------------------------- flow/Arena.h | 8 +++++--- flow/FastAlloc.h | 2 ++ 3 files changed, 12 insertions(+), 31 deletions(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 7e29d34147..b5c1e85dfd 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -6372,33 +6372,6 @@ public: return result; } - ACTOR static Future> readValue_impl(KeyValueStoreRedwoodUnversioned* self, - Key key, - Optional debugID) { - state VersionedBTree::BTreeCursor cur; - wait(self->m_tree->initBTreeCursor(&cur, self->m_tree->getLastCommittedVersion())); - - state Reference readLock = self->m_concurrentReads; - wait(readLock->take()); - state FlowLock::Releaser releaser(*readLock); - ++g_redwoodMetrics.opGet; - - wait(cur.seekGTE(key, 0)); - if (cur.isValid() && cur.get().key == key) { - // Return a Value whose arena is the source page's arena - Value v; - v.arena().dependsOn(cur.back().page->getArena()); - v.contents() = cur.get().value.get(); - return v; - } - - return Optional(); - } - - Future> readValue(KeyRef key, Optional debugID = Optional()) override { - return catchError(readValue_impl(this, key, debugID)); - } - ACTOR static Future> readValuePrefix_impl(KeyValueStoreRedwoodUnversioned* self, Key key, int maxLength, @@ -6413,7 +6386,7 @@ public: wait(cur.seekGTE(key, 0)); if (cur.isValid() && cur.get().key == key) { - // Return a Value whose arena is the source page's arena + // Return a Value whose arena depends on the source page arena Value v; v.arena().dependsOn(cur.back().page->getArena()); v.contents() = cur.get().value.get(); @@ -6432,6 +6405,10 @@ public: return catchError(readValuePrefix_impl(this, key, maxLength, debugID)); } + Future> readValue(KeyRef key, Optional debugID = Optional()) override { + return catchError(readValuePrefix_impl(this, key, std::numeric_limits::max(), debugID)); + } + ~KeyValueStoreRedwoodUnversioned() override{}; 
private: diff --git a/flow/Arena.h b/flow/Arena.h index c08072e35c..b940c6bcb0 100644 --- a/flow/Arena.h +++ b/flow/Arena.h @@ -130,13 +130,15 @@ struct scalar_traits : std::true_type { }; struct ArenaBlockRef { - // Only one of (next, aligned4kBuffer) are valid at any one time, as they occupy the same space. - // If aligned4kBufferSize is not 0, aligned4kBuffer is valid, otherwise next is valid. - uint32_t aligned4kBufferSize; union { ArenaBlock* next; void* aligned4kBuffer; }; + + // Only one of (next, aligned4kBuffer) is valid at any one time, as they occupy the same space. + // If aligned4kBufferSize is not 0, aligned4kBuffer is valid, otherwise next is valid. + uint32_t aligned4kBufferSize; + uint32_t nextBlockOffset; }; diff --git a/flow/FastAlloc.h b/flow/FastAlloc.h index 3f5f2ab58b..55e5731fd5 100644 --- a/flow/FastAlloc.h +++ b/flow/FastAlloc.h @@ -267,6 +267,7 @@ inline void freeFast(int size, void* ptr) { } [[nodiscard]] inline void* allocateFast4kAligned(int size) { + // Use FastAllocator for sizes it supports to avoid internal fragmentation in some implementations of aligned_alloc if (size <= 4096) return FastAllocator<4096>::allocate(); if (size <= 8192) @@ -277,6 +278,7 @@ inline void freeFast(int size, void* ptr) { } inline void freeFast4kAligned(int size, void* ptr) { + // Sizes supported by FastAllocator must be release via FastAllocator if (size <= 4096) return FastAllocator<4096>::release(ptr); if (size <= 8192) From f30793fd85157829541eecbe0c3cc49dff95efe1 Mon Sep 17 00:00:00 2001 From: Steve Atherton Date: Mon, 17 May 2021 19:27:06 -0700 Subject: [PATCH 411/461] Implement getValuePrefix() using getValue() rather than the other way around to avoid the common getValue()'s actor state increasing from 128 to 256 bytes since it is a very hot code path. 
--- fdbserver/IPager.h | 2 +- fdbserver/VersionedBTree.actor.cpp | 25 +++++++++++++------------ 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/fdbserver/IPager.h b/fdbserver/IPager.h index 31ff36ef88..bc2a0f68f1 100644 --- a/fdbserver/IPager.h +++ b/fdbserver/IPager.h @@ -56,7 +56,7 @@ public: if (userData != nullptr && userDataDestructor != nullptr) { userDataDestructor(userData); } - if(buffer != nullptr) { + if (buffer != nullptr) { VALGRIND_MAKE_MEM_UNDEFINED(buffer, bufferSize); } } diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index b5c1e85dfd..8a919fd190 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -6372,10 +6372,9 @@ public: return result; } - ACTOR static Future> readValuePrefix_impl(KeyValueStoreRedwoodUnversioned* self, - Key key, - int maxLength, - Optional debugID) { + ACTOR static Future> readValue_impl(KeyValueStoreRedwoodUnversioned* self, + Key key, + Optional debugID) { state VersionedBTree::BTreeCursor cur; wait(self->m_tree->initBTreeCursor(&cur, self->m_tree->getLastCommittedVersion())); @@ -6390,23 +6389,25 @@ public: Value v; v.arena().dependsOn(cur.back().page->getArena()); v.contents() = cur.get().value.get(); - if (v.size() > maxLength) { - v.contents() = v.substr(0, maxLength); - } return v; } return Optional(); } + Future> readValue(KeyRef key, Optional debugID = Optional()) override { + return catchError(readValue_impl(this, key, debugID)); + } + Future> readValuePrefix(KeyRef key, int maxLength, Optional debugID = Optional()) override { - return catchError(readValuePrefix_impl(this, key, maxLength, debugID)); - } - - Future> readValue(KeyRef key, Optional debugID = Optional()) override { - return catchError(readValuePrefix_impl(this, key, std::numeric_limits::max(), debugID)); + return catchError(map(readValue_impl(this, key, debugID), [maxLength](Optional v) { + if (v.present() && v.get().size() > maxLength) { + v.get().contents() = 
v.get().substr(0, maxLength); + } + return v; + })); } ~KeyValueStoreRedwoodUnversioned() override{}; From 319e77eef12897420e59715c57e69162405620d8 Mon Sep 17 00:00:00 2001 From: RenxuanW Date: Mon, 17 May 2021 19:15:33 -0700 Subject: [PATCH 412/461] Add severity in logWorkerUnavailable(). Also, only log when fitness is GoodFit or BestFit. --- fdbserver/ClusterController.actor.cpp | 89 +++++++++++++++++---------- 1 file changed, 58 insertions(+), 31 deletions(-) diff --git a/fdbserver/ClusterController.actor.cpp b/fdbserver/ClusterController.actor.cpp index 107c865221..53304bc6f6 100644 --- a/fdbserver/ClusterController.actor.cpp +++ b/fdbserver/ClusterController.actor.cpp @@ -459,7 +459,8 @@ public: } // Log the reason why the worker is considered as unavailable. - void logWorkerUnavailable(const UID& id, + void logWorkerUnavailable(const Severity severity, + const UID& id, const std::string& method, const std::string& reason, const WorkerDetails& details, @@ -473,17 +474,21 @@ public: } dcList += printable(dc); } - // Note that the recruitment happens only during initial database creation and recovery. So these trace - // events should be sparse. - TraceEvent("GetTLogTeamWorkerUnavailable", id) - .detail("TLogRecruitMethod", method) - .detail("Reason", reason) - .detail("WorkerID", details.interf.id()) - .detail("WorkerDC", details.interf.locality.dcId()) - .detail("Address", details.interf.addresses().toString()) - .detail("Fitness", fitness) - .detail("RecruitmentDcIds", dcList); - }; + // Logging every possible options is a lot for every recruitment; logging all of the options with GoodFit or + // BestFit may work because there should only be like 30 tlog class processes. Plus, the recruitment happens + // only during initial database creation and recovery. So these trace events should be sparse. 
+ if (fitness == ProcessClass::GoodFit || fitness == ProcessClass::BestFit || + fitness == ProcessClass::NeverAssign) { + TraceEvent(severity, "GetTLogTeamWorkerUnavailable", id) + .detail("TLogRecruitMethod", method) + .detail("Reason", reason) + .detail("WorkerID", details.interf.id()) + .detail("WorkerDC", details.interf.locality.dcId()) + .detail("Address", details.interf.addresses().toString()) + .detail("Fitness", fitness) + .detail("RecruitmentDcIds", dcList); + } + } // A TLog recruitment method specialized for three_data_hall and three_datacenter configurations // It attempts to evenly recruit processes from across data_halls or datacenters @@ -506,29 +511,36 @@ public: if (std::find(exclusionWorkerIds.begin(), exclusionWorkerIds.end(), worker_details.interf.id()) != exclusionWorkerIds.end()) { - logWorkerUnavailable(id, "complex", "Worker is excluded", worker_details, fitness, dcIds); + logWorkerUnavailable(SevInfo, id, "complex", "Worker is excluded", worker_details, fitness, dcIds); continue; } if (!workerAvailable(worker_info, checkStable)) { - logWorkerUnavailable(id, "complex", "Worker is not available", worker_details, fitness, dcIds); + logWorkerUnavailable(SevInfo, id, "complex", "Worker is not available", worker_details, fitness, dcIds); continue; } if (conf.isExcludedServer(worker_details.interf.addresses())) { - logWorkerUnavailable( - id, "complex", "Worker server is excluded from the cluster", worker_details, fitness, dcIds); + logWorkerUnavailable(SevInfo, + id, + "complex", + "Worker server is excluded from the cluster", + worker_details, + fitness, + dcIds); continue; } if (fitness == ProcessClass::NeverAssign) { - logWorkerUnavailable(id, "complex", "Worker's fitness is NeverAssign", worker_details, fitness, dcIds); + logWorkerUnavailable( + SevDebug, id, "complex", "Worker's fitness is NeverAssign", worker_details, fitness, dcIds); continue; } if (!dcIds.empty() && dcIds.count(worker_details.interf.locality.dcId()) == 0) { - 
logWorkerUnavailable(id, "complex", "Worker is not in the target DC", worker_details, fitness, dcIds); + logWorkerUnavailable( + SevDebug, id, "complex", "Worker is not in the target DC", worker_details, fitness, dcIds); continue; } if (!allowDegraded && worker_details.degraded) { logWorkerUnavailable( - id, "complex", "Worker is degraded and not allowed", worker_details, fitness, dcIds); + SevInfo, id, "complex", "Worker is degraded and not allowed", worker_details, fitness, dcIds); continue; } @@ -731,26 +743,34 @@ public: for (const auto& [worker_process_id, worker_info] : id_worker) { const auto& worker_details = worker_info.details; auto fitness = worker_details.processClass.machineClassFitness(ProcessClass::TLog); + if (std::find(exclusionWorkerIds.begin(), exclusionWorkerIds.end(), worker_details.interf.id()) != exclusionWorkerIds.end()) { - logWorkerUnavailable(id, "simple", "Worker is excluded", worker_details, fitness, dcIds); + logWorkerUnavailable(SevInfo, id, "simple", "Worker is excluded", worker_details, fitness, dcIds); continue; } if (!workerAvailable(worker_info, checkStable)) { - logWorkerUnavailable(id, "simple", "Worker is not available", worker_details, fitness, dcIds); + logWorkerUnavailable(SevInfo, id, "simple", "Worker is not available", worker_details, fitness, dcIds); continue; } if (conf.isExcludedServer(worker_details.interf.addresses())) { - logWorkerUnavailable( - id, "simple", "Worker server is excluded from the cluster", worker_details, fitness, dcIds); + logWorkerUnavailable(SevInfo, + id, + "simple", + "Worker server is excluded from the cluster", + worker_details, + fitness, + dcIds); continue; } if (fitness == ProcessClass::NeverAssign) { - logWorkerUnavailable(id, "simple", "Worker's fitness is NeverAssign", worker_details, fitness, dcIds); + logWorkerUnavailable( + SevDebug, id, "complex", "Worker's fitness is NeverAssign", worker_details, fitness, dcIds); continue; } if (!dcIds.empty() && 
dcIds.count(worker_details.interf.locality.dcId()) == 0) { - logWorkerUnavailable(id, "simple", "Worker is not in the target DC", worker_details, fitness, dcIds); + logWorkerUnavailable( + SevDebug, id, "simple", "Worker is not in the target DC", worker_details, fitness, dcIds); continue; } @@ -855,28 +875,35 @@ public: for (const auto& [worker_process_id, worker_info] : id_worker) { const auto& worker_details = worker_info.details; auto fitness = worker_details.processClass.machineClassFitness(ProcessClass::TLog); + if (std::find(exclusionWorkerIds.begin(), exclusionWorkerIds.end(), worker_details.interf.id()) != exclusionWorkerIds.end()) { - logWorkerUnavailable(id, "deprecated", "Worker is excluded", worker_details, fitness, dcIds); + logWorkerUnavailable(SevInfo, id, "deprecated", "Worker is excluded", worker_details, fitness, dcIds); continue; } if (!workerAvailable(worker_info, checkStable)) { - logWorkerUnavailable(id, "deprecated", "Worker is not available", worker_details, fitness, dcIds); + logWorkerUnavailable( + SevInfo, id, "deprecated", "Worker is not available", worker_details, fitness, dcIds); continue; } if (conf.isExcludedServer(worker_details.interf.addresses())) { - logWorkerUnavailable( - id, "deprecated", "Worker server is excluded from the cluster", worker_details, fitness, dcIds); + logWorkerUnavailable(SevInfo, + id, + "deprecated", + "Worker server is excluded from the cluster", + worker_details, + fitness, + dcIds); continue; } if (fitness == ProcessClass::NeverAssign) { logWorkerUnavailable( - id, "deprecated", "Worker's fitness is NeverAssign", worker_details, fitness, dcIds); + SevDebug, id, "complex", "Worker's fitness is NeverAssign", worker_details, fitness, dcIds); continue; } if (!dcIds.empty() && dcIds.count(worker_details.interf.locality.dcId()) == 0) { logWorkerUnavailable( - id, "deprecated", "Worker is not in the target DC", worker_details, fitness, dcIds); + SevDebug, id, "deprecated", "Worker is not in the target DC", 
worker_details, fitness, dcIds); continue; } From 3066e856c962693926005c0828661aa7cf0eeef4 Mon Sep 17 00:00:00 2001 From: Sreenath Bodagala Date: Wed, 19 May 2021 16:08:32 +0000 Subject: [PATCH 413/461] Expose "bounce impact" and Storage Server "version catch-up rate" metrics Changes: storageserver.actor.cpp: Use counters to capture (a) how fast a storage server is catching up in versions and (b) the version fetch frequency. Status.actor.cpp: Report the captured counter metrics as part of storage metrics. --- fdbserver/Status.actor.cpp | 2 ++ fdbserver/storageserver.actor.cpp | 24 +++++++++++------------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/fdbserver/Status.actor.cpp b/fdbserver/Status.actor.cpp index 12235f9d31..2277005867 100644 --- a/fdbserver/Status.actor.cpp +++ b/fdbserver/Status.actor.cpp @@ -491,6 +491,8 @@ struct RolesInfo { obj["mutation_bytes"] = StatusCounter(storageMetrics.getValue("MutationBytes")).getStatus(); obj["mutations"] = StatusCounter(storageMetrics.getValue("Mutations")).getStatus(); obj.setKeyRawNumber("local_rate", storageMetrics.getValue("LocalRate")); + obj["fetched_versions"] = StatusCounter(storageMetrics.getValue("FetchedVersions")).getStatus(); + obj["fetch_frequency"] = StatusCounter(storageMetrics.getValue("FetchFrequency")).getStatus(); Version version = storageMetrics.getInt64("Version"); Version durableVersion = storageMetrics.getInt64("DurableVersion"); diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp index 8cd4680f6d..4516e2a176 100644 --- a/fdbserver/storageserver.actor.cpp +++ b/fdbserver/storageserver.actor.cpp @@ -545,10 +545,7 @@ public: int64_t versionLag; // An estimate for how many versions it takes for the data to move from the logs to this storage // server - // Metrics about the latest batch of versions fetched by this StorageServer - int64_t fetchedVersions; // how many versions were fetched - double duration; // how long (in seconds) it took to fetch 
the versions - Optional sourceTLogID; // the tLog from which the versions were fetched + Optional sourceTLogID; // the tLog from which the latest batch of versions were fetched ProtocolVersion logProtocol; @@ -683,6 +680,8 @@ public: Counter loops; Counter fetchWaitingMS, fetchWaitingCount, fetchExecutingMS, fetchExecutingCount; Counter readsRejected; + Counter fetchedVersions; + Counter fetchFrequency; LatencySample readLatencySample; LatencyBands readLatencyBands; @@ -700,10 +699,11 @@ public: updateBatches("UpdateBatches", cc), updateVersions("UpdateVersions", cc), loops("Loops", cc), fetchWaitingMS("FetchWaitingMS", cc), fetchWaitingCount("FetchWaitingCount", cc), fetchExecutingMS("FetchExecutingMS", cc), fetchExecutingCount("FetchExecutingCount", cc), - readsRejected("ReadsRejected", cc), readLatencySample("ReadLatencyMetrics", - self->thisServerID, - SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL, - SERVER_KNOBS->LATENCY_SAMPLE_SIZE), + readsRejected("ReadsRejected", cc), fetchedVersions("FetchedVersions", cc), + fetchFrequency("FetchFrequency", cc), readLatencySample("ReadLatencyMetrics", + self->thisServerID, + SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL, + SERVER_KNOBS->LATENCY_SAMPLE_SIZE), readLatencyBands("ReadLatencyBands", self->thisServerID, SERVER_KNOBS->STORAGE_LOGGING_DELAY) { specialCounter(cc, "LastTLogVersion", [self]() { return self->lastTLogVersion; }); specialCounter(cc, "Version", [self]() { return self->version.get(); }); @@ -711,8 +711,6 @@ public: specialCounter(cc, "DurableVersion", [self]() { return self->durableVersion.get(); }); specialCounter(cc, "DesiredOldestVersion", [self]() { return self->desiredOldestVersion.get(); }); specialCounter(cc, "VersionLag", [self]() { return self->versionLag; }); - specialCounter(cc, "FetchedVersions", [self]() { return self->fetchedVersions; }); - specialCounter(cc, "Duration", [self]() { return self->duration; }); specialCounter(cc, "LocalRate", [self] { return self->currentRate() * 100; }); 
specialCounter(cc, "BytesReadSampleCount", [self]() { return self->metrics.bytesReadSample.queue.size(); }); @@ -739,7 +737,7 @@ public: : fetchKeysHistograms(), instanceID(deterministicRandom()->randomUniqueID().first()), storage(this, storage), db(db), actors(false), lastTLogVersion(0), lastVersionWithData(0), restoredVersion(0), rebootAfterDurableVersion(std::numeric_limits::max()), durableInProgress(Void()), versionLag(0), - fetchedVersions(0), duration(0.0), primaryLocality(tagLocalityInvalid), updateEagerReads(0), shardChangeCounter(0), + primaryLocality(tagLocalityInvalid), updateEagerReads(0), shardChangeCounter(0), fetchKeysParallelismLock(SERVER_KNOBS->FETCH_KEYS_PARALLELISM_BYTES), shuttingDown(false), debug_inApplyUpdate(false), debug_lastValidateTime(0), watchBytes(0), numWatches(0), logProtocol(0), counters(this), tag(invalidTag), maxQueryQueue(0), thisServerID(ssi.id()), @@ -3530,8 +3528,8 @@ ACTOR Future update(StorageServer* data, bool* pReceivedUpdate) { if (data->otherError.getFuture().isReady()) data->otherError.getFuture().get(); - data->fetchedVersions = ver - data->version.get(); - data->duration = now() - data->lastUpdate; + data->counters.fetchedVersions += (ver - data->version.get()); + ++data->counters.fetchFrequency; Optional curSourceTLogID = cursor->getCurrentPeekLocation(); if (curSourceTLogID != data->sourceTLogID) { From 622f43474aa3a44b711185910bb9304856e1a526 Mon Sep 17 00:00:00 2001 From: Sreenath Bodagala Date: Wed, 19 May 2021 19:54:49 +0000 Subject: [PATCH 414/461] Expose "bounce impact" and Storage Server "version catch-up rate" metrics Changes: Schemas.cpp: Extend the JSON schema to report the new metrics that have been added. mr-status-json-schemas.rst.inc: Update the schema to reflect the changes made to the JSON schema. release-notes-700.rst: Add a note about the new metrics in "Status" section. 
--- .../sphinx/source/mr-status-json-schemas.rst.inc | 10 ++++++++++ .../sphinx/source/release-notes/release-notes-700.rst | 2 ++ fdbclient/Schemas.cpp | 10 ++++++++++ 3 files changed, 22 insertions(+) diff --git a/documentation/sphinx/source/mr-status-json-schemas.rst.inc b/documentation/sphinx/source/mr-status-json-schemas.rst.inc index 202496620d..deb8afcdb7 100644 --- a/documentation/sphinx/source/mr-status-json-schemas.rst.inc +++ b/documentation/sphinx/source/mr-status-json-schemas.rst.inc @@ -121,6 +121,16 @@ "counter":0, "roughness":0.0 }, + "fetched_versions":{ + "hz":0.0, + "counter":0, + "roughness":0.0 + }, + "fetch_frequency":{ + "hz":0.0, + "counter":0, + "roughness":0.0 + }, "grv_latency_statistics":{ // GRV Latency metrics are grouped according to priority (currently batch or default). "default":{ "count":0, diff --git a/documentation/sphinx/source/release-notes/release-notes-700.rst b/documentation/sphinx/source/release-notes/release-notes-700.rst index 8997f8c9fd..85ca56979c 100644 --- a/documentation/sphinx/source/release-notes/release-notes-700.rst +++ b/documentation/sphinx/source/release-notes/release-notes-700.rst @@ -32,6 +32,8 @@ Status ------ * Added ``commit_batching_window_size`` to the proxy roles section of status to record statistics about commit batching window size on each proxy. `(PR #4735) `_ * Added ``cluster.bounce_impact`` section to status to report if there will be any extra effects when bouncing the cluster, and if so, the reason for those effects. `(PR #4770) `_ +* Added ``fetched_versions`` to the storage metrics section of status to report how fast a storage server is catching up in versions. `(PR #4770) `_ +* Added ``fetch_frequency`` to the storage metrics section of status to report the version fetching frequency of a storage server. 
`(PR #4770) `_ Bindings -------- diff --git a/fdbclient/Schemas.cpp b/fdbclient/Schemas.cpp index 0ba2feaaaa..c6dc059573 100644 --- a/fdbclient/Schemas.cpp +++ b/fdbclient/Schemas.cpp @@ -144,6 +144,16 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema( "counter":0, "roughness":0.0 }, + "fetched_versions":{ + "hz":0.0, + "counter":0, + "roughness":0.0 + }, + "fetch_frequency":{ + "hz":0.0, + "counter":0, + "roughness":0.0 + }, "grv_latency_statistics":{ "default":{ "count":0, From d067ca507bea78a7c4c15c5c3a2730536ec71a67 Mon Sep 17 00:00:00 2001 From: Josh Slocum Date: Wed, 19 May 2021 19:57:51 +0000 Subject: [PATCH 415/461] Surfacing non-fatal parse errors in Test Harness output --- contrib/TestHarness/Program.cs.cmake | 11 ++++++++-- contrib/TraceLogHelper/JsonParser.cs | 5 +++-- contrib/TraceLogHelper/XmlParser.cs | 31 +++++++++++++++++----------- 3 files changed, 31 insertions(+), 16 deletions(-) diff --git a/contrib/TestHarness/Program.cs.cmake b/contrib/TestHarness/Program.cs.cmake index 8d666b2725..075a2758d6 100644 --- a/contrib/TestHarness/Program.cs.cmake +++ b/contrib/TestHarness/Program.cs.cmake @@ -717,7 +717,7 @@ namespace SummarizeTest delegate IEnumerable parseDelegate(System.IO.Stream stream, string file, bool keepOriginalElement = false, double startTime = -1, double endTime = Double.MaxValue, - double samplingFactor = 1.0); + double samplingFactor = 1.0, Action nonFatalErrorMessage = null); static int Summarize(string[] traceFiles, string summaryFileName, string errorFileName, bool? killed, List outputErrors, int? exitCode, long? 
peakMemory, @@ -750,12 +750,14 @@ namespace SummarizeTest { try { + // Use Action to set this because IEnumerables with yield can't have an out variable + string nonFatalParseError = null; parseDelegate parse; if (traceFileName.EndsWith(".json")) parse = Magnesium.JsonParser.Parse; else parse = Magnesium.XmlParser.Parse; - foreach (var ev in parse(traceFile, traceFileName)) + foreach (var ev in parse(traceFile, traceFileName, nonFatalErrorMessage: (x) => { nonFatalParseError = x; })) { Magnesium.Severity newSeverity; if (severityMap.TryGetValue(new KeyValuePair(ev.Type, ev.Severity), out newSeverity)) @@ -876,6 +878,11 @@ namespace SummarizeTest if (ev.Type == "StderrSeverity") stderrSeverity = int.Parse(ev.Details.NewSeverity); } + if (nonFatalParseError != null) { + xout.Add(new XElement("NonFatalParseError", + new XAttribute("Severity", (int)Magnesium.Severity.SevWarnAlways), + new XAttribute("ErrorMessage", nonFatalParseError))); + } } catch (Exception e) diff --git a/contrib/TraceLogHelper/JsonParser.cs b/contrib/TraceLogHelper/JsonParser.cs index 9d7272a37f..84fbab81ab 100644 --- a/contrib/TraceLogHelper/JsonParser.cs +++ b/contrib/TraceLogHelper/JsonParser.cs @@ -1,4 +1,4 @@ -/* +/* * JsonParser.cs * * This source file is part of the FoundationDB open source project @@ -34,9 +34,10 @@ namespace Magnesium { static Random r = new Random(); + // dummy parameter nonFatalParseError to match xml public static IEnumerable Parse(System.IO.Stream stream, string file, bool keepOriginalElement = false, double startTime = -1, double endTime = Double.MaxValue, - double samplingFactor = 1.0) + double samplingFactor = 1.0, Action nonFatalErrorMessage = null) { using (var reader = new System.IO.StreamReader(stream)) { diff --git a/contrib/TraceLogHelper/XmlParser.cs b/contrib/TraceLogHelper/XmlParser.cs index 3728c58c3b..9ab79d920e 100644 --- a/contrib/TraceLogHelper/XmlParser.cs +++ b/contrib/TraceLogHelper/XmlParser.cs @@ -33,14 +33,29 @@ namespace Magnesium public static 
IEnumerable Parse(System.IO.Stream stream, string file, bool keepOriginalElement = false, double startTime = -1, double endTime = Double.MaxValue, - double samplingFactor = 1.0) + double samplingFactor = 1.0, Action nonFatalErrorMessage = null) { using (var reader = XmlReader.Create(stream)) { reader.ReadToDescendant("Trace"); reader.Read(); - foreach (var xev in StreamElements(reader)) + + // foreach (var xev in StreamElements(reader)) + // need to be able to catch and save non-fatal exceptions in StreamElements, so use explicit iterator instead of foreach + var iter = StreamElements(reader).GetEnumerator(); + while (true) { + try { + if (!iter.MoveNext()) { + break; + } + } catch (Exception e) { + if (nonFatalErrorMessage != null) { + nonFatalErrorMessage(e.Message); + } + break; + } + var xev = iter.Current; Event ev = null; try { @@ -165,28 +180,20 @@ namespace Magnesium } } + // throws exceptions if xml is invalid private static IEnumerable StreamElements(this XmlReader reader) { while (!reader.EOF) { if (reader.NodeType == XmlNodeType.Element) { - XElement node = null; - try - { - node = XElement.ReadFrom(reader) as XElement; - } - catch (Exception) { break; } + XElement node = XElement.ReadFrom(reader) as XElement; if (node != null) yield return node; } else { - try - { reader.Read(); - } - catch (Exception) { break; } } } } From 907248dcd4d06d1cc6bfffd4a2e5d7f1c364017f Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Wed, 19 May 2021 13:26:01 -0700 Subject: [PATCH 416/461] fixed a rare simulation bug where missingFinalCommit could be skipped by two successive logSystem changes --- fdbserver/OldTLogServer_6_0.actor.cpp | 2 +- fdbserver/OldTLogServer_6_2.actor.cpp | 2 +- fdbserver/TLogServer.actor.cpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fdbserver/OldTLogServer_6_0.actor.cpp b/fdbserver/OldTLogServer_6_0.actor.cpp index a442a3df6a..543111ede6 100644 --- a/fdbserver/OldTLogServer_6_0.actor.cpp +++ 
b/fdbserver/OldTLogServer_6_0.actor.cpp @@ -1498,10 +1498,10 @@ ACTOR Future doQueueCommit(TLogData* self, ACTOR Future commitQueue(TLogData* self) { state Reference logData; + state std::vector> missingFinalCommit; loop { int foundCount = 0; - state std::vector> missingFinalCommit; for (auto it : self->id_data) { if (!it.second->stopped) { logData = it.second; diff --git a/fdbserver/OldTLogServer_6_2.actor.cpp b/fdbserver/OldTLogServer_6_2.actor.cpp index a305b27f3a..c7fea829c5 100644 --- a/fdbserver/OldTLogServer_6_2.actor.cpp +++ b/fdbserver/OldTLogServer_6_2.actor.cpp @@ -1925,10 +1925,10 @@ ACTOR Future doQueueCommit(TLogData* self, ACTOR Future commitQueue(TLogData* self) { state Reference logData; + state std::vector> missingFinalCommit; loop { int foundCount = 0; - state std::vector> missingFinalCommit; for (auto it : self->id_data) { if (!it.second->stopped) { logData = it.second; diff --git a/fdbserver/TLogServer.actor.cpp b/fdbserver/TLogServer.actor.cpp index 76d4bf3bf2..4ea9e83bee 100644 --- a/fdbserver/TLogServer.actor.cpp +++ b/fdbserver/TLogServer.actor.cpp @@ -1965,10 +1965,10 @@ ACTOR Future doQueueCommit(TLogData* self, ACTOR Future commitQueue(TLogData* self) { state Reference logData; + state std::vector> missingFinalCommit; loop { int foundCount = 0; - state std::vector> missingFinalCommit; for (auto it : self->id_data) { if (!it.second->stopped) { logData = it.second; From 2fa80e79126381c1963333a13551feabf362436b Mon Sep 17 00:00:00 2001 From: Sreenath Bodagala Date: Wed, 19 May 2021 22:04:43 +0000 Subject: [PATCH 417/461] Address review comments --- .../sphinx/source/mr-status-json-schemas.rst.inc | 2 +- .../source/release-notes/release-notes-700.rst | 2 +- fdbclient/Schemas.cpp | 2 +- fdbserver/Status.actor.cpp | 2 +- fdbserver/storageserver.actor.cpp | 12 ++++++------ 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/documentation/sphinx/source/mr-status-json-schemas.rst.inc 
b/documentation/sphinx/source/mr-status-json-schemas.rst.inc index deb8afcdb7..7979331898 100644 --- a/documentation/sphinx/source/mr-status-json-schemas.rst.inc +++ b/documentation/sphinx/source/mr-status-json-schemas.rst.inc @@ -126,7 +126,7 @@ "counter":0, "roughness":0.0 }, - "fetch_frequency":{ + "fetches_from_logs":{ "hz":0.0, "counter":0, "roughness":0.0 diff --git a/documentation/sphinx/source/release-notes/release-notes-700.rst b/documentation/sphinx/source/release-notes/release-notes-700.rst index 85ca56979c..cec839fc2e 100644 --- a/documentation/sphinx/source/release-notes/release-notes-700.rst +++ b/documentation/sphinx/source/release-notes/release-notes-700.rst @@ -33,7 +33,7 @@ Status * Added ``commit_batching_window_size`` to the proxy roles section of status to record statistics about commit batching window size on each proxy. `(PR #4735) `_ * Added ``cluster.bounce_impact`` section to status to report if there will be any extra effects when bouncing the cluster, and if so, the reason for those effects. `(PR #4770) `_ * Added ``fetched_versions`` to the storage metrics section of status to report how fast a storage server is catching up in versions. `(PR #4770) `_ -* Added ``fetch_frequency`` to the storage metrics section of status to report the version fetching frequency of a storage server. `(PR #4770) `_ +* Added ``fetch_frequency`` to the storage metrics section of status to report how frequently a storage server fetches updates from transaction logs. 
`(PR #4770) `_ Bindings -------- diff --git a/fdbclient/Schemas.cpp b/fdbclient/Schemas.cpp index c6dc059573..5fef5fb6eb 100644 --- a/fdbclient/Schemas.cpp +++ b/fdbclient/Schemas.cpp @@ -149,7 +149,7 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema( "counter":0, "roughness":0.0 }, - "fetch_frequency":{ + "fetches_from_logs":{ "hz":0.0, "counter":0, "roughness":0.0 diff --git a/fdbserver/Status.actor.cpp b/fdbserver/Status.actor.cpp index 2277005867..5f546638ff 100644 --- a/fdbserver/Status.actor.cpp +++ b/fdbserver/Status.actor.cpp @@ -492,7 +492,7 @@ struct RolesInfo { obj["mutations"] = StatusCounter(storageMetrics.getValue("Mutations")).getStatus(); obj.setKeyRawNumber("local_rate", storageMetrics.getValue("LocalRate")); obj["fetched_versions"] = StatusCounter(storageMetrics.getValue("FetchedVersions")).getStatus(); - obj["fetch_frequency"] = StatusCounter(storageMetrics.getValue("FetchFrequency")).getStatus(); + obj["fetches_from_logs"] = StatusCounter(storageMetrics.getValue("FetchesFromLogs")).getStatus(); Version version = storageMetrics.getInt64("Version"); Version durableVersion = storageMetrics.getInt64("DurableVersion"); diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp index 4516e2a176..1db250d9cd 100644 --- a/fdbserver/storageserver.actor.cpp +++ b/fdbserver/storageserver.actor.cpp @@ -681,7 +681,7 @@ public: Counter fetchWaitingMS, fetchWaitingCount, fetchExecutingMS, fetchExecutingCount; Counter readsRejected; Counter fetchedVersions; - Counter fetchFrequency; + Counter fetchesFromLogs; LatencySample readLatencySample; LatencyBands readLatencyBands; @@ -700,10 +700,10 @@ public: fetchWaitingMS("FetchWaitingMS", cc), fetchWaitingCount("FetchWaitingCount", cc), fetchExecutingMS("FetchExecutingMS", cc), fetchExecutingCount("FetchExecutingCount", cc), readsRejected("ReadsRejected", cc), fetchedVersions("FetchedVersions", cc), - fetchFrequency("FetchFrequency", cc), 
readLatencySample("ReadLatencyMetrics", - self->thisServerID, - SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL, - SERVER_KNOBS->LATENCY_SAMPLE_SIZE), + fetchesFromLogs("FetchesFromLogs", cc), readLatencySample("ReadLatencyMetrics", + self->thisServerID, + SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL, + SERVER_KNOBS->LATENCY_SAMPLE_SIZE), readLatencyBands("ReadLatencyBands", self->thisServerID, SERVER_KNOBS->STORAGE_LOGGING_DELAY) { specialCounter(cc, "LastTLogVersion", [self]() { return self->lastTLogVersion; }); specialCounter(cc, "Version", [self]() { return self->version.get(); }); @@ -3529,7 +3529,7 @@ ACTOR Future update(StorageServer* data, bool* pReceivedUpdate) { data->otherError.getFuture().get(); data->counters.fetchedVersions += (ver - data->version.get()); - ++data->counters.fetchFrequency; + ++data->counters.fetchesFromLogs; Optional curSourceTLogID = cursor->getCurrentPeekLocation(); if (curSourceTLogID != data->sourceTLogID) { From 43e0d362df10991d40647e3730e279cf8e885ecf Mon Sep 17 00:00:00 2001 From: Sreenath Bodagala Date: Wed, 19 May 2021 22:12:34 +0000 Subject: [PATCH 418/461] Address a review comment --- documentation/sphinx/source/release-notes/release-notes-700.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/documentation/sphinx/source/release-notes/release-notes-700.rst b/documentation/sphinx/source/release-notes/release-notes-700.rst index cec839fc2e..ea78b9a10b 100644 --- a/documentation/sphinx/source/release-notes/release-notes-700.rst +++ b/documentation/sphinx/source/release-notes/release-notes-700.rst @@ -33,7 +33,7 @@ Status * Added ``commit_batching_window_size`` to the proxy roles section of status to record statistics about commit batching window size on each proxy. `(PR #4735) `_ * Added ``cluster.bounce_impact`` section to status to report if there will be any extra effects when bouncing the cluster, and if so, the reason for those effects. 
`(PR #4770) `_ * Added ``fetched_versions`` to the storage metrics section of status to report how fast a storage server is catching up in versions. `(PR #4770) `_ -* Added ``fetch_frequency`` to the storage metrics section of status to report how frequently a storage server fetches updates from transaction logs. `(PR #4770) `_ +* Added ``fetches_from_logs`` to the storage metrics section of status to report how frequently a storage server fetches updates from transaction logs. `(PR #4770) `_ Bindings -------- From 93c809764f647a173adf73bcab6990c276213302 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Wed, 19 May 2021 23:52:16 +0000 Subject: [PATCH 419/461] fix Schema check error --- fdbclient/Schemas.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fdbclient/Schemas.cpp b/fdbclient/Schemas.cpp index 682ddc9c9a..ae85799e85 100644 --- a/fdbclient/Schemas.cpp +++ b/fdbclient/Schemas.cpp @@ -727,7 +727,8 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema( "auto_logs":3, "commit_proxies":5, "grv_proxies":1, - "backup_worker_enabled":1 + "backup_worker_enabled":1, + "perpetual_storage_wiggle":0 }, "data":{ "least_operating_space_bytes_log_server":0, From a57061a5ed0de7b8ce9c60f40c0e31b4ee2f94d7 Mon Sep 17 00:00:00 2001 From: Xiaoxi Wang Date: Thu, 20 May 2021 00:06:53 +0000 Subject: [PATCH 420/461] change UID to PID meaning Process ID --- fdbclient/SystemData.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbclient/SystemData.cpp b/fdbclient/SystemData.cpp index fd8d2faa9f..0f035b745c 100644 --- a/fdbclient/SystemData.cpp +++ b/fdbclient/SystemData.cpp @@ -595,7 +595,7 @@ const KeyRangeRef configKeys(LiteralStringRef("\xff/conf/"), LiteralStringRef("\ const KeyRef configKeysPrefix = configKeys.begin; const KeyRef perpetualStorageWiggleKey(LiteralStringRef("\xff/conf/perpetual_storage_wiggle")); -const KeyRef wigglingStorageServerKey(LiteralStringRef("\xff/storageWiggleUID")); +const KeyRef 
wigglingStorageServerKey(LiteralStringRef("\xff/storageWigglePID")); const KeyRef triggerDDTeamInfoPrintKey(LiteralStringRef("\xff/triggerDDTeamInfoPrint")); From 64608fe86b76738d53cbbe00d631a3af8cbbbe85 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Thu, 20 May 2021 13:48:41 -0600 Subject: [PATCH 421/461] allow simulation properties to be overwritten --- fdbserver/SimulatedCluster.actor.cpp | 478 ++++++++++++------ fdbserver/TesterInterface.actor.h | 21 - fdbserver/tester.actor.cpp | 14 - tests/fast/AtomicBackupToDBCorrectness.toml | 1 + tests/fast/BackupToDBCorrectness.toml | 1 + tests/fast/BackupToDBCorrectnessClean.toml | 1 + tests/fast/ConfigureLocked.toml | 3 +- tests/fast/FuzzApiCorrectness.toml | 1 + tests/fast/FuzzApiCorrectnessClean.toml | 1 + tests/fast/KillRegionCycle.toml | 1 + tests/fast/LongStackWriteDuringRead.toml | 1 + tests/fast/LowLatency.toml | 1 + tests/fast/ProtocolVersion.toml | 1 + tests/fast/ReportConflictingKeys.toml | 1 + tests/fast/WriteDuringRead.toml | 1 + tests/fast/WriteDuringReadClean.toml | 1 + tests/rare/ConflictRangeCheck.toml | 1 + tests/rare/ConflictRangeRYOWCheck.toml | 1 + .../from_7.0.0/SnapIncrementalRestore-1.toml | 1 + tests/slow/ApiCorrectnessSwitchover.toml | 1 + tests/slow/DifferentClustersSameRV.toml | 1 + tests/slow/LowLatencyWithFailures.toml | 1 + ...elRestoreNewBackupCorrectnessAtomicOp.toml | 1 + ...allelRestoreNewBackupCorrectnessCycle.toml | 1 + ...estoreNewBackupCorrectnessMultiCycles.toml | 1 + ...NewBackupWriteDuringReadAtomicRestore.toml | 1 + ...elRestoreOldBackupCorrectnessAtomicOp.toml | 1 + ...estoreOldBackupCorrectnessMultiCycles.toml | 1 + ...OldBackupWriteDuringReadAtomicRestore.toml | 1 + tests/slow/SharedBackupCorrectness.toml | 1 + tests/slow/SharedBackupToDBCorrectness.toml | 1 + tests/slow/VersionStampBackupToDB.toml | 1 + tests/slow/VersionStampSwitchover.toml | 1 + tests/slow/WriteDuringReadAtomicRestore.toml | 1 + tests/slow/WriteDuringReadSwitchover.toml | 1 + 35 files changed, 364 
insertions(+), 183 deletions(-) diff --git a/fdbserver/SimulatedCluster.actor.cpp b/fdbserver/SimulatedCluster.actor.cpp index f10ca774bb..128eace3a8 100644 --- a/fdbserver/SimulatedCluster.actor.cpp +++ b/fdbserver/SimulatedCluster.actor.cpp @@ -22,6 +22,7 @@ #include #include #include +#include #include "fdbrpc/Locality.h" #include "fdbrpc/simulator.h" #include "fdbclient/DatabaseContext.h" @@ -37,8 +38,8 @@ #include "fdbclient/BackupAgent.actor.h" #include "fdbclient/versions.h" #include "flow/ProtocolVersion.h" -#include "flow/actorcompiler.h" // This must be the last #include. #include "flow/network.h" +#include "flow/actorcompiler.h" // This must be the last #include. #undef max #undef min @@ -46,10 +47,210 @@ extern "C" int g_expect_full_pointermap; extern const char* getSourceVersion(); +using namespace std::literals; + const int MACHINE_REBOOT_TIME = 10; bool destructed = false; +// Configuration details specified in workload test files that change the simulation +// environment details +class TestConfig { + class ConfigBuilder { + using value_type = toml::basic_value; + std::unordered_map> confMap; + + public: + ConfigBuilder& add(std::string_view key, int* value) { + confMap.emplace(key, [value](value_type const& v) { *value = v.as_integer(); }); + return *this; + } + ConfigBuilder& add(std::string_view key, Optional* value) { + confMap.emplace(key, [value](value_type const& v) { *value = v.as_integer(); }); + return *this; + } + ConfigBuilder& add(std::string_view key, bool* value) { + confMap.emplace(key, [value](value_type const& v) { *value = v.as_boolean(); }); + return *this; + } + ConfigBuilder& add(std::string_view key, Optional* value) { + confMap.emplace(key, [value](value_type const& v) { *value = v.as_boolean(); }); + return *this; + } + ConfigBuilder& add(std::string_view key, std::string* value) { + confMap.emplace(key, [value](value_type const& v) { *value = v.as_string(); }); + return *this; + } + ConfigBuilder& add(std::string_view key, 
Optional* value) { + confMap.emplace(key, [value](value_type const& v) { *value = v.as_string(); }); + return *this; + } + ConfigBuilder& add(std::string_view key, std::vector* value) { + confMap.emplace(key, [value](value_type const& v) { + auto arr = v.as_array(); + for (const auto& i : arr) { + value->push_back(i.as_integer()); + } + }); + return *this; + } + void set(std::string const& key, value_type const& val) { + auto iter = confMap.find(key); + if (iter == confMap.end()) { + std::cerr << "Unknown configuration attribute " << key << std::endl; + TraceEvent("UnknownConfigurationAttribute").detail("Name", key); + throw unknown_error(); + } + iter->second(val); + } + }; + + bool isIniFile(const char* fileName) { + std::string name = fileName; + auto pos = name.find_last_of('.'); + ASSERT(pos != std::string::npos && pos + 1 < name.size()); + auto extension = name.substr(pos + 1); + return extension == "txt"sv; + } + + void loadIniFile(const char* testFile) { + std::ifstream ifs; + ifs.open(testFile, std::ifstream::in); + if (!ifs.good()) + return; + + std::string cline; + + while (ifs.good()) { + getline(ifs, cline); + std::string line = removeWhitespace(std::string(cline)); + if (!line.size() || line.find(';') == 0) + continue; + + size_t found = line.find('='); + if (found == std::string::npos) + // hmmm, not good + continue; + std::string attrib = removeWhitespace(line.substr(0, found)); + std::string value = removeWhitespace(line.substr(found + 1)); + + if (attrib == "extraDB") { + sscanf(value.c_str(), "%d", &extraDB); + } + + if (attrib == "minimumReplication") { + sscanf(value.c_str(), "%d", &minimumReplication); + } + + if (attrib == "minimumRegions") { + sscanf(value.c_str(), "%d", &minimumRegions); + } + + if (attrib == "configureLocked") { + sscanf(value.c_str(), "%d", &configureLocked); + } + + if (attrib == "startIncompatibleProcess") { + startIncompatibleProcess = strcmp(value.c_str(), "true") == 0; + } + + if (attrib == "logAntiQuorum") { + 
sscanf(value.c_str(), "%d", &logAntiQuorum); + } + + if (attrib == "storageEngineExcludeTypes") { + std::stringstream ss(value); + for (int i; ss >> i;) { + storageEngineExcludeTypes.push_back(i); + if (ss.peek() == ',') { + ss.ignore(); + } + } + } + if (attrib == "maxTLogVersion") { + sscanf(value.c_str(), "%d", &maxTLogVersion); + } + } + + ifs.close(); + } + + +public: + int extraDB = 0; + int minimumReplication = 0; + int minimumRegions = 0; + bool configureLocked = false; + bool startIncompatibleProcess = false; + int logAntiQuorum = -1; + // Storage Engine Types: Verify match with SimulationConfig::generateNormalConfig + // 0 = "ssd" + // 1 = "memory" + // 2 = "memory-radixtree-beta" + // 3 = "ssd-redwood-experimental" + // Requires a comma-separated list of numbers WITHOUT whitespaces + std::vector storageEngineExcludeTypes; + // Set the maximum TLog version that can be selected for a test + // Refer to FDBTypes.h::TLogVersion. Defaults to the maximum supported version. + int maxTLogVersion = TLogVersion::MAX_SUPPORTED; + // Set true to simplify simulation configs for easier debugging + bool simpleConfig = false; + Optional generateFearless, buggify; + Optional datacenters, desiredTLogCount, commitProxyCount, grvProxyCount, resolverCount, storageEngineType, + stderrSeverity, machineCount, processesPerMachine, coordinators; + Optional config; + + void readFromConfig(const char* testFile) { + if (isIniFile(testFile)) { + loadIniFile(testFile); + return; + } + ConfigBuilder builder; + builder.add("extraDB", &extraDB) + .add("minimumReplication", &minimumReplication) + .add("minimumRegions", &minimumRegions) + .add("configureLocked", &configureLocked) + .add("startIncompatibleProcess", &startIncompatibleProcess) + .add("logAntiQuorum", &logAntiQuorum) + .add("storageEngineExcludeTypes", &storageEngineExcludeTypes) + .add("maxTLogVersion", &maxTLogVersion) + .add("simpleConfig", &simpleConfig) + .add("generateFearless", &generateFearless) + .add("datacenters", 
&datacenters) + .add("desiredTLogCount", &desiredTLogCount) + .add("commitProxyCount", &commitProxyCount) + .add("grvProxyCount", &grvProxyCount) + .add("resolverCount", &resolverCount) + .add("storageEngineType", &storageEngineType) + .add("config", &config) + .add("buggify", &buggify) + .add("StderrSeverity", &stderrSeverity) + .add("machineCount", &machineCount) + .add("processesPerMachine", &processesPerMachine) + .add("coordinators", &coordinators); + try { + auto file = toml::parse(testFile); + if (file.contains("configuration") && toml::find(file, "configuration").is_table()) { + auto conf = toml::find(file, "configuration").as_table(); + for (const auto& [key, value] : conf) { + if (key == "ClientInfoLogging") { + setNetworkOption(FDBNetworkOptions::DISABLE_CLIENT_STATISTICS_LOGGING); + } else { + builder.set(key, value); + } + } + if (stderrSeverity.present()) { + TraceEvent("StderrSeverity").detail("NewSeverity", stderrSeverity.get()); + } + } + } catch (std::exception& e) { + std::cerr << e.what() << std::endl; + TraceEvent("TOMLParseError").detail("Error", printable(e.what())); + throw unknown_error(); + } + } +}; + template T simulate(const T& in) { BinaryWriter writer(AssumeVersion(g_network->protocolVersion())); @@ -885,30 +1086,57 @@ StringRef StringRefOf(const char* s) { // of different combinations void SimulationConfig::generateNormalConfig(const TestConfig& testConfig) { set_config("new"); - const bool simple = false; // Set true to simplify simulation configs for easier debugging // generateMachineTeamTestConfig set up the number of servers per machine and the number of machines such that // if we do not remove the surplus server and machine teams, the simulation test will report error. // This is needed to make sure the number of server (and machine) teams is no larger than the desired number. bool generateMachineTeamTestConfig = BUGGIFY_WITH_PROB(0.1) ? true : false; - bool generateFearless = simple ? 
false : (testConfig.minimumRegions > 1 || deterministicRandom()->random01() < 0.5); - datacenters = simple ? 1 - : (generateFearless - ? (testConfig.minimumReplication > 0 || deterministicRandom()->random01() < 0.5 ? 4 : 6) + bool generateFearless = + testConfig.simpleConfig ? false : (testConfig.minimumRegions > 1 || deterministicRandom()->random01() < 0.5); + if (testConfig.generateFearless.present()) { + // overwrite whatever decision we made before + generateFearless = testConfig.generateFearless.get(); + } + datacenters = + testConfig.simpleConfig + ? 1 + : (generateFearless ? (testConfig.minimumReplication > 0 || deterministicRandom()->random01() < 0.5 ? 4 : 6) : deterministicRandom()->randomInt(1, 4)); - if (deterministicRandom()->random01() < 0.25) + if (testConfig.datacenters.present()) { + datacenters = testConfig.datacenters.get(); + } + if (testConfig.desiredTLogCount.present()) { + db.desiredTLogCount = testConfig.desiredTLogCount.get(); + } else if (deterministicRandom()->random01() < 0.25) { db.desiredTLogCount = deterministicRandom()->randomInt(1, 7); - if (deterministicRandom()->random01() < 0.25) + } + + if (testConfig.commitProxyCount.present()) { + db.commitProxyCount = testConfig.commitProxyCount.get(); + } else if (deterministicRandom()->random01() < 0.25) { db.commitProxyCount = deterministicRandom()->randomInt(1, 7); - if (deterministicRandom()->random01() < 0.25) + } + + if (testConfig.grvProxyCount.present()) { + db.grvProxyCount = testConfig.grvProxyCount.get(); + } else if (deterministicRandom()->random01() < 0.25) { db.grvProxyCount = deterministicRandom()->randomInt(1, 4); - if (deterministicRandom()->random01() < 0.25) + } + + if (testConfig.resolverCount.present()) { + db.resolverCount = testConfig.resolverCount.get(); + } else if (deterministicRandom()->random01() < 0.25) { db.resolverCount = deterministicRandom()->randomInt(1, 7); + } int storage_engine_type = deterministicRandom()->randomInt(0, 4); - // Continuously re-pick the 
storage engine type if it's the one we want to exclude - while (std::find(testConfig.storageEngineExcludeTypes.begin(), - testConfig.storageEngineExcludeTypes.end(), - storage_engine_type) != testConfig.storageEngineExcludeTypes.end()) { - storage_engine_type = deterministicRandom()->randomInt(0, 4); + if (testConfig.storageEngineType.present()) { + storage_engine_type = testConfig.storageEngineType.get(); + } else { + // Continuously re-pick the storage engine type if it's the one we want to exclude + while (std::find(testConfig.storageEngineExcludeTypes.begin(), + testConfig.storageEngineExcludeTypes.end(), + storage_engine_type) != testConfig.storageEngineExcludeTypes.end()) { + storage_engine_type = deterministicRandom()->randomInt(0, 4); + } } switch (storage_engine_type) { case 0: { @@ -941,75 +1169,81 @@ void SimulationConfig::generateNormalConfig(const TestConfig& testConfig) { // } // set_config("memory"); // set_config("memory-radixtree-beta"); - if (simple) { + if (testConfig.simpleConfig) { db.desiredTLogCount = 1; db.commitProxyCount = 1; db.grvProxyCount = 1; db.resolverCount = 1; } - int replication_type = simple ? 1 - : (std::max(testConfig.minimumReplication, - datacenters > 4 ? deterministicRandom()->randomInt(1, 3) - : std::min(deterministicRandom()->randomInt(0, 6), 3))); - switch (replication_type) { - case 0: { - TEST(true); // Simulated cluster using custom redundancy mode - int storage_servers = deterministicRandom()->randomInt(1, generateFearless ? 4 : 5); - // FIXME: log replicas must be more than storage replicas because otherwise better master exists will not - // recognize it needs to change dcs - int replication_factor = deterministicRandom()->randomInt(storage_servers, generateFearless ? 
4 : 5); - int anti_quorum = deterministicRandom()->randomInt( - 0, - (replication_factor / 2) + 1); // The anti quorum cannot be more than half of the replication factor, or the - // log system will continue to accept commits when a recovery is impossible - // Go through buildConfiguration, as it sets tLogPolicy/storagePolicy. - set_config(format("storage_replicas:=%d log_replicas:=%d log_anti_quorum:=%d " - "replica_datacenters:=1 min_replica_datacenters:=1", - storage_servers, - replication_factor, - anti_quorum)); - break; - } - case 1: { - TEST(true); // Simulated cluster running in single redundancy mode - set_config("single"); - break; - } - case 2: { - TEST(true); // Simulated cluster running in double redundancy mode - set_config("double"); - break; - } - case 3: { - if (datacenters <= 2 || generateFearless) { - TEST(true); // Simulated cluster running in triple redundancy mode - set_config("triple"); - } else if (datacenters == 3) { - TEST(true); // Simulated cluster running in 3 data-hall mode - set_config("three_data_hall"); - } else { - ASSERT(false); - } - break; - } - default: - ASSERT(false); // Programmer forgot to adjust cases. - } - - if (deterministicRandom()->random01() < 0.5) { - int logSpill = deterministicRandom()->randomInt(TLogSpillType::VALUE, TLogSpillType::END); - set_config(format("log_spill:=%d", logSpill)); - int logVersion = deterministicRandom()->randomInt(TLogVersion::MIN_RECRUITABLE, testConfig.maxTLogVersion + 1); - set_config(format("log_version:=%d", logVersion)); + if (testConfig.config.present()) { + set_config(testConfig.config.get()); } else { - if (deterministicRandom()->random01() < 0.7) - set_config(format("log_version:=%d", testConfig.maxTLogVersion)); - if (deterministicRandom()->random01() < 0.5) - set_config(format("log_spill:=%d", TLogSpillType::DEFAULT)); - } - - if (deterministicRandom()->random01() < 0.5) { - set_config("backup_worker_enabled:=1"); + int replication_type = testConfig.simpleConfig + ? 
1 + : (std::max(testConfig.minimumReplication, + datacenters > 4 ? deterministicRandom()->randomInt(1, 3) + : std::min(deterministicRandom()->randomInt(0, 6), 3))); + switch (replication_type) { + case 0: { + TEST(true); // Simulated cluster using custom redundancy mode + int storage_servers = deterministicRandom()->randomInt(1, generateFearless ? 4 : 5); + // FIXME: log replicas must be more than storage replicas because otherwise better master exists will not + // recognize it needs to change dcs + int replication_factor = deterministicRandom()->randomInt(storage_servers, generateFearless ? 4 : 5); + int anti_quorum = deterministicRandom()->randomInt( + 0, + (replication_factor / 2) + + 1); // The anti quorum cannot be more than half of the replication factor, or the + // log system will continue to accept commits when a recovery is impossible + // Go through buildConfiguration, as it sets tLogPolicy/storagePolicy. + set_config(format("storage_replicas:=%d log_replicas:=%d log_anti_quorum:=%d " + "replica_datacenters:=1 min_replica_datacenters:=1", + storage_servers, + replication_factor, + anti_quorum)); + break; + } + case 1: { + TEST(true); // Simulated cluster running in single redundancy mode + set_config("single"); + break; + } + case 2: { + TEST(true); // Simulated cluster running in double redundancy mode + set_config("double"); + break; + } + case 3: { + if (datacenters <= 2 || generateFearless) { + TEST(true); // Simulated cluster running in triple redundancy mode + set_config("triple"); + } else if (datacenters == 3) { + TEST(true); // Simulated cluster running in 3 data-hall mode + set_config("three_data_hall"); + } else { + ASSERT(false); + } + break; + } + default: + ASSERT(false); // Programmer forgot to adjust cases. 
+ } + if (deterministicRandom()->random01() < 0.5) { + int logSpill = deterministicRandom()->randomInt(TLogSpillType::VALUE, TLogSpillType::END); + set_config(format("log_spill:=%d", logSpill)); + int logVersion = + deterministicRandom()->randomInt(TLogVersion::MIN_RECRUITABLE, testConfig.maxTLogVersion + 1); + set_config(format("log_version:=%d", logVersion)); + } else { + if (deterministicRandom()->random01() < 0.7) + set_config(format("log_version:=%d", testConfig.maxTLogVersion)); + if (deterministicRandom()->random01() < 0.5) + set_config(format("log_spill:=%d", TLogSpillType::DEFAULT)); + } + + if (deterministicRandom()->random01() < 0.5) { + set_config("backup_worker_enabled:=1"); + } } if (generateFearless || (datacenters == 2 && deterministicRandom()->random01() < 0.5)) { @@ -1211,7 +1445,9 @@ void SimulationConfig::generateNormalConfig(const TestConfig& testConfig) { } } - if (generateFearless && testConfig.minimumReplication > 1) { + if (testConfig.machineCount.present()) { + machine_count = testConfig.machineCount.get(); + } else if (generateFearless && testConfig.minimumReplication > 1) { // low latency tests in fearless configurations need 4 machines per datacenter (3 for triple replication, 1 that // is down during failures). machine_count = 16; @@ -1234,11 +1470,15 @@ void SimulationConfig::generateNormalConfig(const TestConfig& testConfig) { } } - // because we protect a majority of coordinators from being killed, it is better to run with low numbers of - // coordinators to prevent too many processes from being protected - coordinators = (testConfig.minimumRegions <= 1 && BUGGIFY) - ? 
deterministicRandom()->randomInt(1, std::max(machine_count, 2)) - : 1; + if (testConfig.coordinators.present()) { + coordinators = testConfig.coordinators.get(); + } else { + // because we protect a majority of coordinators from being killed, it is better to run with low numbers of + // coordinators to prevent too many processes from being protected + coordinators = (testConfig.minimumRegions <= 1 && BUGGIFY) + ? deterministicRandom()->randomInt(1, std::max(machine_count, 2)) + : 1; + } if (testConfig.minimumReplication > 1 && datacenters == 3) { // low latency tests in 3 data hall mode need 2 other data centers with 2 machines each to avoid waiting for @@ -1247,7 +1487,9 @@ void SimulationConfig::generateNormalConfig(const TestConfig& testConfig) { coordinators = 3; } - if (generateFearless) { + if (testConfig.processesPerMachine.present()) { + processes_per_machine = testConfig.processesPerMachine.get(); + } else if (generateFearless) { processes_per_machine = 1; } else { processes_per_machine = deterministicRandom()->randomInt(1, (extraDB ? 14 : 28) / machine_count + 2); @@ -1626,68 +1868,10 @@ void setupSimulatedSystem(vector>* systemActors, .detail("StartingConfiguration", pStartingConfiguration->toString()); } +using namespace std::literals; + // Populates the TestConfig fields according to what is found in the test file. 
-void checkTestConf(const char* testFile, TestConfig* testConfig) { - std::ifstream ifs; - ifs.open(testFile, std::ifstream::in); - if (!ifs.good()) - return; - - std::string cline; - - while (ifs.good()) { - getline(ifs, cline); - std::string line = removeWhitespace(std::string(cline)); - if (!line.size() || line.find(';') == 0) - continue; - - size_t found = line.find('='); - if (found == std::string::npos) - // hmmm, not good - continue; - std::string attrib = removeWhitespace(line.substr(0, found)); - std::string value = removeWhitespace(line.substr(found + 1)); - - if (attrib == "extraDB") { - sscanf(value.c_str(), "%d", &testConfig->extraDB); - } - - if (attrib == "minimumReplication") { - sscanf(value.c_str(), "%d", &testConfig->minimumReplication); - } - - if (attrib == "minimumRegions") { - sscanf(value.c_str(), "%d", &testConfig->minimumRegions); - } - - if (attrib == "configureLocked") { - sscanf(value.c_str(), "%d", &testConfig->configureLocked); - } - - if (attrib == "startIncompatibleProcess") { - testConfig->startIncompatibleProcess = strcmp(value.c_str(), "true") == 0; - } - - if (attrib == "logAntiQuorum") { - sscanf(value.c_str(), "%d", &testConfig->logAntiQuorum); - } - - if (attrib == "storageEngineExcludeTypes") { - std::stringstream ss(value); - for (int i; ss >> i;) { - testConfig->storageEngineExcludeTypes.push_back(i); - if (ss.peek() == ',') { - ss.ignore(); - } - } - } - if (attrib == "maxTLogVersion") { - sscanf(value.c_str(), "%d", &testConfig->maxTLogVersion); - } - } - - ifs.close(); -} +void checkTestConf(const char* testFile, TestConfig* testConfig) {} ACTOR void setupAndRun(std::string dataFolder, const char* testFile, @@ -1699,7 +1883,7 @@ ACTOR void setupAndRun(std::string dataFolder, state Standalone startingConfiguration; state int testerCount = 1; state TestConfig testConfig; - checkTestConf(testFile, &testConfig); + testConfig.readFromConfig(testFile); g_simulator.hasDiffProtocolProcess = testConfig.startIncompatibleProcess; 
g_simulator.setDiffProtocol = false; diff --git a/fdbserver/TesterInterface.actor.h b/fdbserver/TesterInterface.actor.h index ddfb04da22..8320cc566b 100644 --- a/fdbserver/TesterInterface.actor.h +++ b/fdbserver/TesterInterface.actor.h @@ -100,27 +100,6 @@ struct WorkloadRequest { } }; -// Configuration details specified in workload test files that change the simulation -// environment details -struct TestConfig { - int extraDB = 0; - int minimumReplication = 0; - int minimumRegions = 0; - int configureLocked = 0; - bool startIncompatibleProcess = false; - int logAntiQuorum = -1; - // Storage Engine Types: Verify match with SimulationConfig::generateNormalConfig - // 0 = "ssd" - // 1 = "memory" - // 2 = "memory-radixtree-beta" - // 3 = "ssd-redwood-experimental" - // Requires a comma-separated list of numbers WITHOUT whitespaces - std::vector storageEngineExcludeTypes; - // Set the maximum TLog version that can be selected for a test - // Refer to FDBTypes.h::TLogVersion. Defaults to the maximum supported version. 
- int maxTLogVersion = TLogVersion::MAX_SUPPORTED; -}; - struct TesterInterface { constexpr static FileIdentifier file_identifier = 4465210; RequestStream recruitments; diff --git a/fdbserver/tester.actor.cpp b/fdbserver/tester.actor.cpp index fa18a376a4..4b98b38486 100644 --- a/fdbserver/tester.actor.cpp +++ b/fdbserver/tester.actor.cpp @@ -1249,20 +1249,6 @@ std::vector readTOMLTests_(std::string fileName) { const toml::value& conf = toml::parse(fileName); - // Handle all global settings - for (const auto& [k, v] : conf.as_table()) { - if (k == "test") { - continue; - } - if (testSpecGlobalKeys.find(k) != testSpecGlobalKeys.end()) { - testSpecGlobalKeys[k](toml_to_string(v)); - } else { - TraceEvent(SevError, "TestSpecUnrecognizedGlobalParam") - .detail("Attrib", k) - .detail("Value", toml_to_string(v)); - } - } - // Then parse each test const toml::array& tests = toml::find(conf, "test").as_array(); for (const toml::value& test : tests) { diff --git a/tests/fast/AtomicBackupToDBCorrectness.toml b/tests/fast/AtomicBackupToDBCorrectness.toml index cc6f8e453a..1b601da923 100644 --- a/tests/fast/AtomicBackupToDBCorrectness.toml +++ b/tests/fast/AtomicBackupToDBCorrectness.toml @@ -1,3 +1,4 @@ +[configuration] extraDB = 1 [[test]] diff --git a/tests/fast/BackupToDBCorrectness.toml b/tests/fast/BackupToDBCorrectness.toml index 62f30151d4..cf50093657 100644 --- a/tests/fast/BackupToDBCorrectness.toml +++ b/tests/fast/BackupToDBCorrectness.toml @@ -1,3 +1,4 @@ +[configuration] extraDB = 1 [[test]] diff --git a/tests/fast/BackupToDBCorrectnessClean.toml b/tests/fast/BackupToDBCorrectnessClean.toml index 0dfdbbd8b0..9c2e9135e5 100644 --- a/tests/fast/BackupToDBCorrectnessClean.toml +++ b/tests/fast/BackupToDBCorrectnessClean.toml @@ -1,3 +1,4 @@ +[configuration] extraDB = 1 [[test]] diff --git a/tests/fast/ConfigureLocked.toml b/tests/fast/ConfigureLocked.toml index 169fc2a2a3..592701931a 100644 --- a/tests/fast/ConfigureLocked.toml +++ b/tests/fast/ConfigureLocked.toml 
@@ -1,4 +1,5 @@ -configureLocked = 1 +[configuration] +configureLocked = true [[test]] testTitle = 'ConfigureLocked' diff --git a/tests/fast/FuzzApiCorrectness.toml b/tests/fast/FuzzApiCorrectness.toml index 0e1e88619c..20d4e215b5 100644 --- a/tests/fast/FuzzApiCorrectness.toml +++ b/tests/fast/FuzzApiCorrectness.toml @@ -1,3 +1,4 @@ +[configuration] StderrSeverity = 30 [[test]] diff --git a/tests/fast/FuzzApiCorrectnessClean.toml b/tests/fast/FuzzApiCorrectnessClean.toml index 9b66edf86d..7165deda42 100644 --- a/tests/fast/FuzzApiCorrectnessClean.toml +++ b/tests/fast/FuzzApiCorrectnessClean.toml @@ -1,3 +1,4 @@ +[configuration] StderrSeverity = 30 [[test]] diff --git a/tests/fast/KillRegionCycle.toml b/tests/fast/KillRegionCycle.toml index 71eebfbc2a..77bd6ce2ef 100644 --- a/tests/fast/KillRegionCycle.toml +++ b/tests/fast/KillRegionCycle.toml @@ -1,3 +1,4 @@ +[configuration] minimumRegions = 2 [[test]] diff --git a/tests/fast/LongStackWriteDuringRead.toml b/tests/fast/LongStackWriteDuringRead.toml index e80ff22846..d65d9a2a91 100644 --- a/tests/fast/LongStackWriteDuringRead.toml +++ b/tests/fast/LongStackWriteDuringRead.toml @@ -1,3 +1,4 @@ +[configuration] StderrSeverity = 30 [[test]] diff --git a/tests/fast/LowLatency.toml b/tests/fast/LowLatency.toml index bcf71ba942..d8af3b38c9 100644 --- a/tests/fast/LowLatency.toml +++ b/tests/fast/LowLatency.toml @@ -1,3 +1,4 @@ +[configuration] buggify = false minimumReplication = 2 diff --git a/tests/fast/ProtocolVersion.toml b/tests/fast/ProtocolVersion.toml index 626b876dd5..2cf223b5db 100644 --- a/tests/fast/ProtocolVersion.toml +++ b/tests/fast/ProtocolVersion.toml @@ -1,3 +1,4 @@ +[configuration] startIncompatibleProcess = true [[test]] diff --git a/tests/fast/ReportConflictingKeys.toml b/tests/fast/ReportConflictingKeys.toml index 2f81880c00..6b0654c143 100644 --- a/tests/fast/ReportConflictingKeys.toml +++ b/tests/fast/ReportConflictingKeys.toml @@ -1,3 +1,4 @@ +[configuration] buggify = false [[test]] diff --git 
a/tests/fast/WriteDuringRead.toml b/tests/fast/WriteDuringRead.toml index 82b39e78ae..565fc957df 100644 --- a/tests/fast/WriteDuringRead.toml +++ b/tests/fast/WriteDuringRead.toml @@ -1,3 +1,4 @@ +[configuration] StderrSeverity = 30 [[test]] diff --git a/tests/fast/WriteDuringReadClean.toml b/tests/fast/WriteDuringReadClean.toml index fca62f39ec..83e61507c2 100644 --- a/tests/fast/WriteDuringReadClean.toml +++ b/tests/fast/WriteDuringReadClean.toml @@ -1,3 +1,4 @@ +[configuration] StderrSeverity = 30 [[test]] diff --git a/tests/rare/ConflictRangeCheck.toml b/tests/rare/ConflictRangeCheck.toml index f923ebb137..3e8860fd50 100644 --- a/tests/rare/ConflictRangeCheck.toml +++ b/tests/rare/ConflictRangeCheck.toml @@ -1,3 +1,4 @@ +[configuration] buggify = false [[test]] diff --git a/tests/rare/ConflictRangeRYOWCheck.toml b/tests/rare/ConflictRangeRYOWCheck.toml index 1a2e4f39d0..d190459170 100644 --- a/tests/rare/ConflictRangeRYOWCheck.toml +++ b/tests/rare/ConflictRangeRYOWCheck.toml @@ -1,3 +1,4 @@ +[configuration] buggify = false [[test]] diff --git a/tests/restarting/from_7.0.0/SnapIncrementalRestore-1.toml b/tests/restarting/from_7.0.0/SnapIncrementalRestore-1.toml index efa3bae128..6321090c4e 100644 --- a/tests/restarting/from_7.0.0/SnapIncrementalRestore-1.toml +++ b/tests/restarting/from_7.0.0/SnapIncrementalRestore-1.toml @@ -1,3 +1,4 @@ +[configuration] logAntiQuorum = 0 [[test]] diff --git a/tests/slow/ApiCorrectnessSwitchover.toml b/tests/slow/ApiCorrectnessSwitchover.toml index 98b5ebd3a1..d97474e422 100644 --- a/tests/slow/ApiCorrectnessSwitchover.toml +++ b/tests/slow/ApiCorrectnessSwitchover.toml @@ -1,3 +1,4 @@ +[configuration] extraDB = 2 [[test]] diff --git a/tests/slow/DifferentClustersSameRV.toml b/tests/slow/DifferentClustersSameRV.toml index 4cda3eaea4..4d14271361 100644 --- a/tests/slow/DifferentClustersSameRV.toml +++ b/tests/slow/DifferentClustersSameRV.toml @@ -1,3 +1,4 @@ +[configuration] extraDB = 2 [[test]] diff --git 
a/tests/slow/LowLatencyWithFailures.toml b/tests/slow/LowLatencyWithFailures.toml index 3888bb9c26..21514f247c 100644 --- a/tests/slow/LowLatencyWithFailures.toml +++ b/tests/slow/LowLatencyWithFailures.toml @@ -1,3 +1,4 @@ +[configuration] minimumReplication = 2 [[test]] diff --git a/tests/slow/ParallelRestoreNewBackupCorrectnessAtomicOp.toml b/tests/slow/ParallelRestoreNewBackupCorrectnessAtomicOp.toml index a208f02872..ba56f68d31 100644 --- a/tests/slow/ParallelRestoreNewBackupCorrectnessAtomicOp.toml +++ b/tests/slow/ParallelRestoreNewBackupCorrectnessAtomicOp.toml @@ -1,4 +1,5 @@ # Disable buggify for parallel restore +#[configuration] #buggify=on [[test]] diff --git a/tests/slow/ParallelRestoreNewBackupCorrectnessCycle.toml b/tests/slow/ParallelRestoreNewBackupCorrectnessCycle.toml index 2c2d4b0333..3acece923d 100644 --- a/tests/slow/ParallelRestoreNewBackupCorrectnessCycle.toml +++ b/tests/slow/ParallelRestoreNewBackupCorrectnessCycle.toml @@ -1,4 +1,5 @@ # Disable buggify for parallel restore +#[configuration] #buggify=off [[test]] diff --git a/tests/slow/ParallelRestoreNewBackupCorrectnessMultiCycles.toml b/tests/slow/ParallelRestoreNewBackupCorrectnessMultiCycles.toml index c94b2bc7a8..7862f5784a 100644 --- a/tests/slow/ParallelRestoreNewBackupCorrectnessMultiCycles.toml +++ b/tests/slow/ParallelRestoreNewBackupCorrectnessMultiCycles.toml @@ -1,4 +1,5 @@ # Disable buggify for parallel restore +#[configuration] #buggify=off [[test]] diff --git a/tests/slow/ParallelRestoreNewBackupWriteDuringReadAtomicRestore.toml b/tests/slow/ParallelRestoreNewBackupWriteDuringReadAtomicRestore.toml index ae92b4b956..4b305660bc 100644 --- a/tests/slow/ParallelRestoreNewBackupWriteDuringReadAtomicRestore.toml +++ b/tests/slow/ParallelRestoreNewBackupWriteDuringReadAtomicRestore.toml @@ -1,3 +1,4 @@ +[configuration] StderrSeverity = 30 [[test]] diff --git a/tests/slow/ParallelRestoreOldBackupCorrectnessAtomicOp.toml 
b/tests/slow/ParallelRestoreOldBackupCorrectnessAtomicOp.toml index 04130159d1..647d15ec26 100644 --- a/tests/slow/ParallelRestoreOldBackupCorrectnessAtomicOp.toml +++ b/tests/slow/ParallelRestoreOldBackupCorrectnessAtomicOp.toml @@ -1,4 +1,5 @@ # Disable buggify for parallel restore +#[configuration] #buggify=on [[test]] diff --git a/tests/slow/ParallelRestoreOldBackupCorrectnessMultiCycles.toml b/tests/slow/ParallelRestoreOldBackupCorrectnessMultiCycles.toml index 8dc215c593..8f6f7b92aa 100644 --- a/tests/slow/ParallelRestoreOldBackupCorrectnessMultiCycles.toml +++ b/tests/slow/ParallelRestoreOldBackupCorrectnessMultiCycles.toml @@ -1,4 +1,5 @@ # Disable buggify for parallel restore +#[configuration] #buggify=off [[test]] diff --git a/tests/slow/ParallelRestoreOldBackupWriteDuringReadAtomicRestore.toml b/tests/slow/ParallelRestoreOldBackupWriteDuringReadAtomicRestore.toml index e09dc4fdd9..0479031d75 100644 --- a/tests/slow/ParallelRestoreOldBackupWriteDuringReadAtomicRestore.toml +++ b/tests/slow/ParallelRestoreOldBackupWriteDuringReadAtomicRestore.toml @@ -1,3 +1,4 @@ +[configuration] StderrSeverity = 30 [[test]] diff --git a/tests/slow/SharedBackupCorrectness.toml b/tests/slow/SharedBackupCorrectness.toml index 253736e6ce..c03b89831a 100644 --- a/tests/slow/SharedBackupCorrectness.toml +++ b/tests/slow/SharedBackupCorrectness.toml @@ -1,3 +1,4 @@ +[configuration] extraDB = 1 [[test]] diff --git a/tests/slow/SharedBackupToDBCorrectness.toml b/tests/slow/SharedBackupToDBCorrectness.toml index 2a6b45f0ec..3a3a07dfbd 100644 --- a/tests/slow/SharedBackupToDBCorrectness.toml +++ b/tests/slow/SharedBackupToDBCorrectness.toml @@ -1,3 +1,4 @@ +[configuration] extraDB = 1 [[test]] diff --git a/tests/slow/VersionStampBackupToDB.toml b/tests/slow/VersionStampBackupToDB.toml index 29d86df4e1..4b36182dd0 100644 --- a/tests/slow/VersionStampBackupToDB.toml +++ b/tests/slow/VersionStampBackupToDB.toml @@ -1,3 +1,4 @@ +[configuration] extraDB = 2 [[test]] diff --git 
a/tests/slow/VersionStampSwitchover.toml b/tests/slow/VersionStampSwitchover.toml index f65086ab59..328c199b93 100644 --- a/tests/slow/VersionStampSwitchover.toml +++ b/tests/slow/VersionStampSwitchover.toml @@ -1,3 +1,4 @@ +[configuration] extraDB = 2 [[test]] diff --git a/tests/slow/WriteDuringReadAtomicRestore.toml b/tests/slow/WriteDuringReadAtomicRestore.toml index 96868a11ef..a148f0a1c9 100644 --- a/tests/slow/WriteDuringReadAtomicRestore.toml +++ b/tests/slow/WriteDuringReadAtomicRestore.toml @@ -1,3 +1,4 @@ +[configuration] StderrSeverity = 30 [[test]] diff --git a/tests/slow/WriteDuringReadSwitchover.toml b/tests/slow/WriteDuringReadSwitchover.toml index 7eaa3a36b0..b5232c9119 100644 --- a/tests/slow/WriteDuringReadSwitchover.toml +++ b/tests/slow/WriteDuringReadSwitchover.toml @@ -1,3 +1,4 @@ +[configuration] StderrSeverity = 30 extraDB = 2 From bb076115c9b308ecfa703f396955c82df65d5c3d Mon Sep 17 00:00:00 2001 From: Jingyu Zhou Date: Fri, 21 May 2021 16:40:29 -0700 Subject: [PATCH 422/461] Only enable backup worker when using partitioned logs This addresses issue #4849. 
--- fdbclient/FileBackupAgent.actor.cpp | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/fdbclient/FileBackupAgent.actor.cpp b/fdbclient/FileBackupAgent.actor.cpp index dfd33b5b67..317e4cc095 100644 --- a/fdbclient/FileBackupAgent.actor.cpp +++ b/fdbclient/FileBackupAgent.actor.cpp @@ -2705,13 +2705,17 @@ struct StartFullBackupTaskFunc : BackupTaskFuncBase { wait(checkTaskVersion(cx, task, StartFullBackupTaskFunc::name, StartFullBackupTaskFunc::version)); state Reference tr(new ReadYourWritesTransaction(cx)); + state BackupConfig config(task); + state Future> partitionedLog; loop { try { tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); tr->setOption(FDBTransactionOptions::LOCK_AWARE); - Version startVersion = wait(tr->getReadVersion()); + partitionedLog = config.partitionedLogEnabled().get(tr); + state Future startVersionFuture = tr->getReadVersion(); + wait(success(partitionedLog) && success(startVersionFuture)); - Params.beginVersion().set(task, startVersion); + Params.beginVersion().set(task, startVersionFuture.get()); break; } catch (Error& e) { wait(tr->onError(e)); @@ -2721,14 +2725,15 @@ struct StartFullBackupTaskFunc : BackupTaskFuncBase { // Check if backup worker is enabled DatabaseConfiguration dbConfig = wait(getDatabaseConfiguration(cx)); state bool backupWorkerEnabled = dbConfig.backupWorkerEnabled; - if (!backupWorkerEnabled) { + if (!backupWorkerEnabled && partitionedLog.get().present() && partitionedLog.get().get()) { + // Change configuration only when we set to use partitioned logs and + // the flag was not set before. 
wait(success(changeConfig(cx, "backup_worker_enabled:=1", true))); backupWorkerEnabled = true; } // Set the "backupStartedKey" and wait for all backup worker started tr->reset(); - state BackupConfig config(task); loop { state Future watchFuture; try { @@ -2738,7 +2743,7 @@ struct StartFullBackupTaskFunc : BackupTaskFuncBase { state Future> started = tr->get(backupStartedKey); state Future> taskStarted = tr->get(config.allWorkerStarted().key); - state Future> partitionedLog = config.partitionedLogEnabled().get(tr); + partitionedLog = config.partitionedLogEnabled().get(tr); wait(success(started) && success(taskStarted) && success(partitionedLog)); if (!partitionedLog.get().present() || !partitionedLog.get().get()) { From 19b9a35c586b7d6c517560df09d89c351d53629a Mon Sep 17 00:00:00 2001 From: Sreenath Bodagala Date: Mon, 24 May 2021 18:37:48 +0000 Subject: [PATCH 423/461] Expose "bounce impact" and Storage Server "version catch-up rate" metrics Update the Status section of release-notes-630.rst with info about the new status fields introduced by PR https://github.com/apple/foundationdb/pull/4770 . --- .../sphinx/source/release-notes/release-notes-630.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/documentation/sphinx/source/release-notes/release-notes-630.rst b/documentation/sphinx/source/release-notes/release-notes-630.rst index 2057e7fcb2..8d2ff59aba 100644 --- a/documentation/sphinx/source/release-notes/release-notes-630.rst +++ b/documentation/sphinx/source/release-notes/release-notes-630.rst @@ -113,6 +113,9 @@ Status * If a process is unable to flush trace logs to disk, the problem will now be reported via the output of ``status`` command inside ``fdbcli``. `(PR #2605) `_ `(PR #2820) `_ * When a configuration key is changed, it will always be included in ``status json`` output, even the value is reverted back to the default value. 
[6.3.5] `(PR #3610) `_ * Added transactions.rejected_for_queued_too_long for bookkeeping the number of transactions rejected by commit proxy because its queuing time exceeds MVCC window.[6.3.11] `(PR #4353) `_ +* Added ``cluster.bounce_impact`` section to status to report if there will be any extra effects when bouncing the cluster, and if so, the reason for those effects. `(PR #4770) `_ +* Added ``fetched_versions`` to the storage metrics section of status to report how fast a storage server is catching up in versions. `(PR #4770) `_ +* Added ``fetches_from_logs`` to the storage metrics section of status to report how frequently a storage server fetches updates from transaction logs. `(PR #4770) `_ Bindings -------- From eaad5798dd69cdf9c2be6ba65c797c9ca03f8819 Mon Sep 17 00:00:00 2001 From: Jon Fu Date: Mon, 24 May 2021 15:10:58 -0400 Subject: [PATCH 424/461] update release notes to match the ones on release-6.3 --- .../sphinx/source/release-notes/release-notes-630.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/documentation/sphinx/source/release-notes/release-notes-630.rst b/documentation/sphinx/source/release-notes/release-notes-630.rst index 8d2ff59aba..38eab2be6f 100644 --- a/documentation/sphinx/source/release-notes/release-notes-630.rst +++ b/documentation/sphinx/source/release-notes/release-notes-630.rst @@ -5,14 +5,19 @@ Release Notes 6.3.13 ====== +* Added ``commit_batching_window_size`` to the proxy roles section of status to record statistics about commit batching window size on each proxy. `(PR #4736) `_ * The multi-version client now requires at most two client connections with version 6.2 or larger, regardless of how many external clients are configured. Clients older than 6.2 will continue to create an additional connection each. `(PR #4667) `_ * Fix an accounting error that could potentially result in inaccuracies in priority busyness metrics. 
`(PR #4824) `_ +* Added ``cluster.bounce_impact`` section to status to report if there will be any extra effects when bouncing the cluster, and if so, the reason for those effects. `(PR #4770) `_ +* Added ``fetched_versions`` to the storage metrics section of status to report how fast a storage server is catching up in versions. `(PR #4770) `_ +* Added ``fetches_from_logs`` to the storage metrics section of status to report how frequently a storage server fetches updates from transaction logs. `(PR #4770) `_ 6.3.12 ====== * Change the default for --knob_tls_server_handshake_threads to 64. The previous was 1000. This avoids starting 1000 threads by default, but may adversely affect recovery time for large clusters using tls. Users with large tls clusters should consider explicitly setting this knob in their foundationdb.conf file. `(PR #4421) `_ * Fix accounting error that could cause commits to incorrectly fail with ``proxy_memory_limit_exceeded``. `(PR #4526) `_ * As an optimization, partial restore using target key ranges now filters backup log data prior to loading it into the database. `(PR #4554) `_ +* Fix fault tolerance calculation when there are no tLogs in LogSet. 
`(PR #4454) `_ 6.3.11 ====== From a107604d3a4dde095448f623f6e31601eeace18d Mon Sep 17 00:00:00 2001 From: Jon Fu Date: Mon, 24 May 2021 15:53:21 -0400 Subject: [PATCH 425/461] Move patch notes to the appropriate section, and add new 6.3.14 notes --- .../sphinx/source/release-notes/release-notes-630.rst | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/documentation/sphinx/source/release-notes/release-notes-630.rst b/documentation/sphinx/source/release-notes/release-notes-630.rst index 38eab2be6f..6cb34252ec 100644 --- a/documentation/sphinx/source/release-notes/release-notes-630.rst +++ b/documentation/sphinx/source/release-notes/release-notes-630.rst @@ -3,14 +3,18 @@ Release Notes ############# +6.3.14 +====== +* Added ``cluster.bounce_impact`` section to status to report if there will be any extra effects when bouncing the cluster, and if so, the reason for those effects. `(PR #4770) `_ +* Added ``fetched_versions`` to the storage metrics section of status to report how fast a storage server is catching up in versions. `(PR #4770) `_ +* Added ``fetches_from_logs`` to the storage metrics section of status to report how frequently a storage server fetches updates from transaction logs. `(PR #4770) `_ +* Added the ``bypass_unreadable`` transaction option which allows ``get`` operations to read from sections of keyspace that have become unreadable because of versionstamp operations. `(PR #4774) `_ + 6.3.13 ====== * Added ``commit_batching_window_size`` to the proxy roles section of status to record statistics about commit batching window size on each proxy. `(PR #4736) `_ * The multi-version client now requires at most two client connections with version 6.2 or larger, regardless of how many external clients are configured. Clients older than 6.2 will continue to create an additional connection each. `(PR #4667) `_ * Fix an accounting error that could potentially result in inaccuracies in priority busyness metrics. 
`(PR #4824) `_ -* Added ``cluster.bounce_impact`` section to status to report if there will be any extra effects when bouncing the cluster, and if so, the reason for those effects. `(PR #4770) `_ -* Added ``fetched_versions`` to the storage metrics section of status to report how fast a storage server is catching up in versions. `(PR #4770) `_ -* Added ``fetches_from_logs`` to the storage metrics section of status to report how frequently a storage server fetches updates from transaction logs. `(PR #4770) `_ 6.3.12 ====== From 61e2ec62cc3cffdcfa8892978044dd306dc7e0ff Mon Sep 17 00:00:00 2001 From: Jon Fu Date: Mon, 24 May 2021 16:06:37 -0400 Subject: [PATCH 426/461] more release notes added --- documentation/sphinx/source/release-notes/release-notes-630.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/documentation/sphinx/source/release-notes/release-notes-630.rst b/documentation/sphinx/source/release-notes/release-notes-630.rst index 6cb34252ec..c180798496 100644 --- a/documentation/sphinx/source/release-notes/release-notes-630.rst +++ b/documentation/sphinx/source/release-notes/release-notes-630.rst @@ -9,6 +9,7 @@ Release Notes * Added ``fetched_versions`` to the storage metrics section of status to report how fast a storage server is catching up in versions. `(PR #4770) `_ * Added ``fetches_from_logs`` to the storage metrics section of status to report how frequently a storage server fetches updates from transaction logs. `(PR #4770) `_ * Added the ``bypass_unreadable`` transaction option which allows ``get`` operations to read from sections of keyspace that have become unreadable because of versionstamp operations. `(PR #4774) `_ +* Fix several packaging issues. The osx package should now install successfully, and the structure of the RPM and DEB packages should match that of 6.2. 
`(PR #4810) `_ 6.3.13 ====== From 3b08f39cc34af28579b32120b6f51ca127fe0791 Mon Sep 17 00:00:00 2001 From: Jon Fu Date: Mon, 24 May 2021 16:07:55 -0400 Subject: [PATCH 427/461] remove extra notes in "Status" section --- .../sphinx/source/release-notes/release-notes-630.rst | 3 --- 1 file changed, 3 deletions(-) diff --git a/documentation/sphinx/source/release-notes/release-notes-630.rst b/documentation/sphinx/source/release-notes/release-notes-630.rst index c180798496..307e08ce87 100644 --- a/documentation/sphinx/source/release-notes/release-notes-630.rst +++ b/documentation/sphinx/source/release-notes/release-notes-630.rst @@ -123,9 +123,6 @@ Status * If a process is unable to flush trace logs to disk, the problem will now be reported via the output of ``status`` command inside ``fdbcli``. `(PR #2605) `_ `(PR #2820) `_ * When a configuration key is changed, it will always be included in ``status json`` output, even the value is reverted back to the default value. [6.3.5] `(PR #3610) `_ * Added transactions.rejected_for_queued_too_long for bookkeeping the number of transactions rejected by commit proxy because its queuing time exceeds MVCC window.[6.3.11] `(PR #4353) `_ -* Added ``cluster.bounce_impact`` section to status to report if there will be any extra effects when bouncing the cluster, and if so, the reason for those effects. `(PR #4770) `_ -* Added ``fetched_versions`` to the storage metrics section of status to report how fast a storage server is catching up in versions. `(PR #4770) `_ -* Added ``fetches_from_logs`` to the storage metrics section of status to report how frequently a storage server fetches updates from transaction logs. 
`(PR #4770) `_ Bindings -------- From a0b136356054230aa69248f87836afc70efac35d Mon Sep 17 00:00:00 2001 From: Jon Fu Date: Mon, 24 May 2021 16:28:55 -0400 Subject: [PATCH 428/461] move note to 6.3.14 --- documentation/sphinx/source/release-notes/release-notes-630.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/documentation/sphinx/source/release-notes/release-notes-630.rst b/documentation/sphinx/source/release-notes/release-notes-630.rst index 307e08ce87..af5b2a02c7 100644 --- a/documentation/sphinx/source/release-notes/release-notes-630.rst +++ b/documentation/sphinx/source/release-notes/release-notes-630.rst @@ -10,12 +10,12 @@ Release Notes * Added ``fetches_from_logs`` to the storage metrics section of status to report how frequently a storage server fetches updates from transaction logs. `(PR #4770) `_ * Added the ``bypass_unreadable`` transaction option which allows ``get`` operations to read from sections of keyspace that have become unreadable because of versionstamp operations. `(PR #4774) `_ * Fix several packaging issues. The osx package should now install successfully, and the structure of the RPM and DEB packages should match that of 6.2. `(PR #4810) `_ +* Fix an accounting error that could potentially result in inaccuracies in priority busyness metrics. `(PR #4824) `_ 6.3.13 ====== * Added ``commit_batching_window_size`` to the proxy roles section of status to record statistics about commit batching window size on each proxy. `(PR #4736) `_ * The multi-version client now requires at most two client connections with version 6.2 or larger, regardless of how many external clients are configured. Clients older than 6.2 will continue to create an additional connection each. `(PR #4667) `_ -* Fix an accounting error that could potentially result in inaccuracies in priority busyness metrics. 
`(PR #4824) `_ 6.3.12 ====== From 7efa4e02bcc5de4f9608febf9139bf97c0e939da Mon Sep 17 00:00:00 2001 From: Jon Fu Date: Mon, 24 May 2021 18:23:14 -0400 Subject: [PATCH 429/461] add 6.3.12 notes --- documentation/sphinx/source/release-notes/release-notes-630.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/documentation/sphinx/source/release-notes/release-notes-630.rst b/documentation/sphinx/source/release-notes/release-notes-630.rst index af5b2a02c7..859aee0eb4 100644 --- a/documentation/sphinx/source/release-notes/release-notes-630.rst +++ b/documentation/sphinx/source/release-notes/release-notes-630.rst @@ -23,6 +23,8 @@ Release Notes * Fix accounting error that could cause commits to incorrectly fail with ``proxy_memory_limit_exceeded``. `(PR #4526) `_ * As an optimization, partial restore using target key ranges now filters backup log data prior to loading it into the database. `(PR #4554) `_ * Fix fault tolerance calculation when there are no tLogs in LogSet. `(PR #4454) `_ +* Change client's ``iteration_progression`` size defaults from 256 to 4096 bytes for better performance. `(PR #4416) `_ +* Add the ability to instrument java driver actions, such as FDBTransaction and RangeQuery. 
`(PR #4385) `_ 6.3.11 ====== From 80abc4d86543b34b58abb14c8418f94f66eda86d Mon Sep 17 00:00:00 2001 From: Jon Fu Date: Mon, 24 May 2021 18:27:56 -0400 Subject: [PATCH 430/461] update formatting --- documentation/sphinx/source/release-notes/release-notes-630.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/documentation/sphinx/source/release-notes/release-notes-630.rst b/documentation/sphinx/source/release-notes/release-notes-630.rst index 859aee0eb4..bebd55e859 100644 --- a/documentation/sphinx/source/release-notes/release-notes-630.rst +++ b/documentation/sphinx/source/release-notes/release-notes-630.rst @@ -24,7 +24,7 @@ Release Notes * As an optimization, partial restore using target key ranges now filters backup log data prior to loading it into the database. `(PR #4554) `_ * Fix fault tolerance calculation when there are no tLogs in LogSet. `(PR #4454) `_ * Change client's ``iteration_progression`` size defaults from 256 to 4096 bytes for better performance. `(PR #4416) `_ -* Add the ability to instrument java driver actions, such as FDBTransaction and RangeQuery. `(PR #4385) `_ +* Add the ability to instrument java driver actions, such as ``FDBTransaction`` and ``RangeQuery``. 
`(PR #4385) `_ 6.3.11 ====== From 7cdd43c352c6d04555f12aef9036e2c7d906da66 Mon Sep 17 00:00:00 2001 From: Andrew Noyes Date: Mon, 24 May 2021 19:19:54 +0000 Subject: [PATCH 431/461] Handle retriable errors better in fdb_c_unit_tests --- bindings/c/test/unit/unit_tests.cpp | 87 +++++++++++++++++++++-------- 1 file changed, 63 insertions(+), 24 deletions(-) diff --git a/bindings/c/test/unit/unit_tests.cpp b/bindings/c/test/unit/unit_tests.cpp index 360284e55d..703b1273dd 100644 --- a/bindings/c/test/unit/unit_tests.cpp +++ b/bindings/c/test/unit/unit_tests.cpp @@ -263,13 +263,15 @@ TEST_CASE("fdb_future_set_callback") { &context)); fdb_error_t err = wait_future(f1); + + context.event.wait(); // Wait until callback is called + if (err) { fdb::EmptyFuture f2 = tr.on_error(err); fdb_check(wait_future(f2)); continue; } - context.event.wait(); break; } } @@ -515,10 +517,10 @@ TEST_CASE("write system key") { fdb::Transaction tr(db); std::string syskey("\xff\x02"); - fdb_check(tr.set_option(FDB_TR_OPTION_ACCESS_SYSTEM_KEYS, nullptr, 0)); - tr.set(syskey, "bar"); while (1) { + fdb_check(tr.set_option(FDB_TR_OPTION_ACCESS_SYSTEM_KEYS, nullptr, 0)); + tr.set(syskey, "bar"); fdb::EmptyFuture f1 = tr.commit(); fdb_error_t err = wait_future(f1); @@ -949,16 +951,25 @@ TEST_CASE("fdb_transaction_clear") { } TEST_CASE("fdb_transaction_atomic_op FDB_MUTATION_TYPE_ADD") { - insert_data(db, create_data({ { "foo", "a" } })); + insert_data(db, create_data({ { "foo", "\x00" } })); fdb::Transaction tr(db); int8_t param = 1; + int potentialCommitCount = 0; while (1) { tr.atomic_op(key("foo"), (const uint8_t*)¶m, sizeof(param), FDB_MUTATION_TYPE_ADD); + if (potentialCommitCount + 1 == 256) { + // Trying to commit again might overflow the one unsigned byte we're looking at + break; + } + ++potentialCommitCount; fdb::EmptyFuture f1 = tr.commit(); fdb_error_t err = wait_future(f1); if (err) { + if (fdb_error_predicate(FDB_ERROR_PREDICATE_RETRYABLE_NOT_COMMITTED, err)) { + 
--potentialCommitCount; + } fdb::EmptyFuture f2 = tr.on_error(err); fdb_check(wait_future(f2)); continue; @@ -969,7 +980,8 @@ TEST_CASE("fdb_transaction_atomic_op FDB_MUTATION_TYPE_ADD") { auto value = get_value(key("foo"), /* snapshot */ false, {}); REQUIRE(value.has_value()); CHECK(value->size() == 1); - CHECK(value->data()[0] == 'b'); // incrementing 'a' results in 'b' + CHECK(uint8_t(value->data()[0]) > 0); + CHECK(uint8_t(value->data()[0]) <= potentialCommitCount); } TEST_CASE("fdb_transaction_atomic_op FDB_MUTATION_TYPE_BIT_AND") { @@ -1139,14 +1151,19 @@ TEST_CASE("fdb_transaction_atomic_op FDB_MUTATION_TYPE_BIT_XOR") { fdb::Transaction tr(db); char param[] = { 'a', 'd' }; + int potentialCommitCount = 0; while (1) { tr.atomic_op(key("foo"), (const uint8_t*)"b", 1, FDB_MUTATION_TYPE_BIT_XOR); tr.atomic_op(key("bar"), (const uint8_t*)param, 2, FDB_MUTATION_TYPE_BIT_XOR); tr.atomic_op(key("baz"), (const uint8_t*)"d", 1, FDB_MUTATION_TYPE_BIT_XOR); + ++potentialCommitCount; fdb::EmptyFuture f1 = tr.commit(); fdb_error_t err = wait_future(f1); if (err) { + if (fdb_error_predicate(FDB_ERROR_PREDICATE_RETRYABLE_NOT_COMMITTED, err)) { + --potentialCommitCount; + } fdb::EmptyFuture f2 = tr.on_error(err); fdb_check(wait_future(f2)); continue; @@ -1154,6 +1171,11 @@ TEST_CASE("fdb_transaction_atomic_op FDB_MUTATION_TYPE_BIT_XOR") { break; } + if (potentialCommitCount != 1) { + MESSAGE("Transaction may not have committed exactly once. 
Suppressing assertions"); + return; + } + auto value = get_value(key("foo"), /* snapshot */ false, {}); REQUIRE(value.has_value()); CHECK(value->size() == 1); @@ -1204,13 +1226,18 @@ TEST_CASE("fdb_transaction_atomic_op FDB_MUTATION_TYPE_APPEND_IF_FITS") { insert_data(db, create_data({ { "foo", "f" } })); fdb::Transaction tr(db); + int potentialCommitCount = 0; while (1) { tr.atomic_op(key("foo"), (const uint8_t*)"db", 2, FDB_MUTATION_TYPE_APPEND_IF_FITS); tr.atomic_op(key("bar"), (const uint8_t*)"foundation", 10, FDB_MUTATION_TYPE_APPEND_IF_FITS); + ++potentialCommitCount; fdb::EmptyFuture f1 = tr.commit(); fdb_error_t err = wait_future(f1); if (err) { + if (fdb_error_predicate(FDB_ERROR_PREDICATE_RETRYABLE_NOT_COMMITTED, err)) { + --potentialCommitCount; + } fdb::EmptyFuture f2 = tr.on_error(err); fdb_check(wait_future(f2)); continue; @@ -1218,13 +1245,18 @@ TEST_CASE("fdb_transaction_atomic_op FDB_MUTATION_TYPE_APPEND_IF_FITS") { break; } - auto value = get_value(key("foo"), /* snapshot */ false, {}); - REQUIRE(value.has_value()); - CHECK(value->compare("fdb") == 0); + auto value_foo = get_value(key("foo"), /* snapshot */ false, {}); + REQUIRE(value_foo.has_value()); - value = get_value(key("bar"), /* snapshot */ false, {}); - REQUIRE(value.has_value()); - CHECK(value->compare("foundation") == 0); + auto value_bar = get_value(key("bar"), /* snapshot */ false, {}); + REQUIRE(value_bar.has_value()); + + if (potentialCommitCount != 1) { + MESSAGE("Transaction may not have committed exactly once. 
Suppressing assertions"); + } else { + CHECK(value_foo.value() == "fdb"); + CHECK(value_bar.value() == "foundation"); + } } TEST_CASE("fdb_transaction_atomic_op FDB_MUTATION_TYPE_MAX") { @@ -1576,7 +1608,7 @@ TEST_CASE("fdb_transaction_watch max watches") { fdb_check(f1.set_callback( +[](FDBFuture* f, void* param) { fdb_error_t err = fdb_future_get_error(f); - if (err != 1101) { // operation_cancelled + if (err != /*operation_cancelled*/ 1101 && !fdb_error_predicate(FDB_ERROR_PREDICATE_RETRYABLE, err)) { CHECK(err == 1032); // too_many_watches } auto* event = static_cast*>(param); @@ -1587,7 +1619,7 @@ TEST_CASE("fdb_transaction_watch max watches") { fdb_check(f2.set_callback( +[](FDBFuture* f, void* param) { fdb_error_t err = fdb_future_get_error(f); - if (err != 1101) { // operation_cancelled + if (err != /*operation_cancelled*/ 1101 && !fdb_error_predicate(FDB_ERROR_PREDICATE_RETRYABLE, err)) { CHECK(err == 1032); // too_many_watches } auto* event = static_cast*>(param); @@ -1598,7 +1630,7 @@ TEST_CASE("fdb_transaction_watch max watches") { fdb_check(f3.set_callback( +[](FDBFuture* f, void* param) { fdb_error_t err = fdb_future_get_error(f); - if (err != 1101) { // operation_cancelled + if (err != /*operation_cancelled*/ 1101 && !fdb_error_predicate(FDB_ERROR_PREDICATE_RETRYABLE, err)) { CHECK(err == 1032); // too_many_watches } auto* event = static_cast*>(param); @@ -1609,7 +1641,7 @@ TEST_CASE("fdb_transaction_watch max watches") { fdb_check(f4.set_callback( +[](FDBFuture* f, void* param) { fdb_error_t err = fdb_future_get_error(f); - if (err != 1101) { // operation_cancelled + if (err != /*operation_cancelled*/ 1101 && !fdb_error_predicate(FDB_ERROR_PREDICATE_RETRYABLE, err)) { CHECK(err == 1032); // too_many_watches } auto* event = static_cast*>(param); @@ -1671,7 +1703,7 @@ TEST_CASE("fdb_transaction_cancel") { // ... until the transaction has been reset. 
tr.reset(); fdb::ValueFuture f2 = tr.get("foo", /* snapshot */ false); - fdb_check(wait_future(f2)); + CHECK(wait_future(f2) != 1025); // transaction_cancelled } TEST_CASE("fdb_transaction_add_conflict_range") { @@ -2146,22 +2178,29 @@ TEST_CASE("monitor_network_busyness") { } int main(int argc, char** argv) { - if (argc != 3 && argc != 4) { + if (argc < 3) { std::cout << "Unit tests for the FoundationDB C API.\n" - << "Usage: fdb_c_unit_tests /path/to/cluster_file key_prefix [externalClient]" << std::endl; + << "Usage: fdb_c_unit_tests /path/to/cluster_file key_prefix [externalClient] [doctest args]" + << std::endl; return 1; } fdb_check(fdb_select_api_version(710)); - if (argc == 4) { + if (argc >= 4) { std::string externalClientLibrary = argv[3]; - fdb_check(fdb_network_set_option( - FDBNetworkOption::FDB_NET_OPTION_DISABLE_LOCAL_CLIENT, reinterpret_cast(""), 0)); - fdb_check(fdb_network_set_option(FDBNetworkOption::FDB_NET_OPTION_EXTERNAL_CLIENT_LIBRARY, - reinterpret_cast(externalClientLibrary.c_str()), - externalClientLibrary.size())); + if (externalClientLibrary.substr(0, 2) != "--") { + fdb_check(fdb_network_set_option( + FDBNetworkOption::FDB_NET_OPTION_DISABLE_LOCAL_CLIENT, reinterpret_cast(""), 0)); + fdb_check(fdb_network_set_option(FDBNetworkOption::FDB_NET_OPTION_EXTERNAL_CLIENT_LIBRARY, + reinterpret_cast(externalClientLibrary.c_str()), + externalClientLibrary.size())); + } } + /* fdb_check(fdb_network_set_option( */ + /* FDBNetworkOption::FDB_NET_OPTION_CLIENT_BUGGIFY_ENABLE, reinterpret_cast(""), 0)); */ + doctest::Context context; + context.applyCommandLine(argc, argv); fdb_check(fdb_setup_network()); std::thread network_thread{ &fdb_run_network }; From 07edc1db9a84d3da138a773cd5cfd33d55412f1f Mon Sep 17 00:00:00 2001 From: Josh Slocum Date: Tue, 25 May 2021 15:32:49 +0000 Subject: [PATCH 432/461] Removing spaces in SevWarn trace event names --- fdbserver/FDBExecHelper.actor.cpp | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 
deletions(-) diff --git a/fdbserver/FDBExecHelper.actor.cpp b/fdbserver/FDBExecHelper.actor.cpp index 618bba35be..e4185ad7c2 100644 --- a/fdbserver/FDBExecHelper.actor.cpp +++ b/fdbserver/FDBExecHelper.actor.cpp @@ -148,7 +148,10 @@ ACTOR Future spawnProcess(std::string path, state pid_t pid = pidAndReadFD.first; state Optional readFD = pidAndReadFD.second; if (pid == -1) { - TraceEvent(SevWarnAlways, "SpawnProcess: Command failed to spawn").detail("Cmd", path).detail("Args", allArgs); + TraceEvent(SevWarnAlways, "SpawnProcessFailure") + .detail("Reason", "Command failed to spawn") + .detail("Cmd", path) + .detail("Args", allArgs); return -1; } else if (pid > 0) { state int status = -1; @@ -160,7 +163,8 @@ ACTOR Future spawnProcess(std::string path, if (runTime > maxWaitTime) { // timing out - TraceEvent(SevWarnAlways, "SpawnProcess : Command failed, timeout") + TraceEvent(SevWarnAlways, "SpawnProcessFailure") + .detail("Reason", "Command failed, timeout") .detail("Cmd", path) .detail("Args", allArgs); return -1; @@ -175,9 +179,10 @@ ACTOR Future spawnProcess(std::string path, } if (err < 0) { - TraceEvent event(SevWarnAlways, "SpawnProcess : Command failed"); + TraceEvent event(SevWarnAlways, "SpawnProcessFailure"); setupTraceWithOutput(event, bytesRead, outputBuffer); - event.detail("Cmd", path) + event.detail("Reason", "Command failed") + .detail("Cmd", path) .detail("Args", allArgs) .detail("Errno", WIFEXITED(status) ? WEXITSTATUS(status) : -1); return -1; @@ -194,14 +199,15 @@ ACTOR Future spawnProcess(std::string path, } else { // child process completed if (!(WIFEXITED(status) && WEXITSTATUS(status) == 0)) { - TraceEvent event(SevWarnAlways, "SpawnProcess : Command failed"); + TraceEvent event(SevWarnAlways, "SpawnProcessFailure"); setupTraceWithOutput(event, bytesRead, outputBuffer); - event.detail("Cmd", path) + event.detail("Reason", "Command failed") + .detail("Cmd", path) .detail("Args", allArgs) .detail("Errno", WIFEXITED(status) ? 
WEXITSTATUS(status) : -1); return WIFEXITED(status) ? WEXITSTATUS(status) : -1; } - TraceEvent event("SpawnProcess : Command status"); + TraceEvent event("SpawnProcess_CommandStatus"); setupTraceWithOutput(event, bytesRead, outputBuffer); event.detail("Cmd", path) .detail("Args", allArgs) From a39dec1380f61bee25d0cc9fe20c3008bb37d37d Mon Sep 17 00:00:00 2001 From: Josh Slocum Date: Tue, 25 May 2021 15:51:26 +0000 Subject: [PATCH 433/461] Fixing multiple small redwood test bugs --- fdbserver/VersionedBTree.actor.cpp | 37 ++++++++++++++++++++++++++---- 1 file changed, 33 insertions(+), 4 deletions(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 8a919fd190..49eac05655 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -40,6 +40,12 @@ #define REDWOOD_DEBUG 0 +// Only print redwood debug statements for a certain address. Useful in simulation with many redwood processes to reduce +// log size. +#define REDWOOD_DEBUG_ADDR 0 +// example addr: "[abcd::4:0:1:4]:1" +#define REDWOOD_DEBUG_ADDR_VAL ""; + #define debug_printf_stream stdout #define debug_printf_always(...) \ { \ @@ -49,11 +55,25 @@ fflush(debug_printf_stream); \ } +#define debug_printf_addr(...) \ + { \ + std::string addr = REDWOOD_DEBUG_ADDR_VAL; \ + if (!memcmp(addr.c_str(), g_network->getLocalAddress().toString().c_str(), addr.size())) { \ + std::string prefix = \ + format("%s %f %04d ", g_network->getLocalAddress().toString().c_str(), now(), __LINE__); \ + std::string msg = format(__VA_ARGS__); \ + writePrefixedLines(debug_printf_stream, prefix, msg); \ + fflush(debug_printf_stream); \ + } \ + } + #define debug_printf_noop(...) 
#if defined(NO_INTELLISENSE) #if REDWOOD_DEBUG #define debug_printf debug_printf_always +#elif REDWOOD_DEBUG_ADDR +#define debug_printf debug_printf_addr #else #define debug_printf debug_printf_noop #endif @@ -3868,9 +3888,10 @@ private: std::unordered_map parents; ParentInfoMapT childUpdateTracker; - // MetaKey changes size so allocate space for it to expand into + // MetaKey changes size so allocate space for it to expand into. FIXME: Steve is fixing this to be dynamically + // sized. union { - uint8_t headerSpace[sizeof(MetaKey) + sizeof(LogicalPageID) * 30]; + uint8_t headerSpace[sizeof(MetaKey) + sizeof(LogicalPageID) * 200]; MetaKey m_header; }; @@ -7548,7 +7569,11 @@ TEST_CASE("/redwood/correctness/unit/deltaTree/IntIntPair") { std::set uniqueItems; while (uniqueItems.size() < N) { IntIntPair p = randomPair(); - if (uniqueItems.count(p) == 0) { + auto nextP = p; // also check if next highest/lowest key is not in set for testLTE/testGTE + nextP.v++; + auto prevP = p; + prevP.v--; + if (uniqueItems.count(p) == 0 && uniqueItems.count(nextP) == 0 && uniqueItems.count(prevP) == 0) { uniqueItems.insert(p); } } @@ -7566,7 +7591,11 @@ TEST_CASE("/redwood/correctness/unit/deltaTree/IntIntPair") { std::vector toDelete; while (1) { IntIntPair p = randomPair(); - if (uniqueItems.count(p) == 0) { + auto nextP = p; // also check if next highest/lowest key is not in set for testLTE/testGTE + nextP.v++; + auto prevP = p; + prevP.v--; + if (uniqueItems.count(p) == 0 && uniqueItems.count(nextP) == 0 && uniqueItems.count(prevP) == 0) { if (!r.insert(p)) { break; }; From c31196ab01d2ebf65389a92bc941d2195dec2137 Mon Sep 17 00:00:00 2001 From: Josh Slocum Date: Tue, 25 May 2021 10:24:00 -0700 Subject: [PATCH 434/461] Update fdbserver/FDBExecHelper.actor.cpp Co-authored-by: A.J. 
Beamon --- fdbserver/FDBExecHelper.actor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbserver/FDBExecHelper.actor.cpp b/fdbserver/FDBExecHelper.actor.cpp index e4185ad7c2..1a32999e7d 100644 --- a/fdbserver/FDBExecHelper.actor.cpp +++ b/fdbserver/FDBExecHelper.actor.cpp @@ -207,7 +207,7 @@ ACTOR Future spawnProcess(std::string path, .detail("Errno", WIFEXITED(status) ? WEXITSTATUS(status) : -1); return WIFEXITED(status) ? WEXITSTATUS(status) : -1; } - TraceEvent event("SpawnProcess_CommandStatus"); + TraceEvent event("SpawnProcessCommandStatus"); setupTraceWithOutput(event, bytesRead, outputBuffer); event.detail("Cmd", path) .detail("Args", allArgs) From ce82c9653e7708e0ce6bdb38c538934f3825fa2d Mon Sep 17 00:00:00 2001 From: Josh Slocum Date: Fri, 5 Mar 2021 19:28:15 +0000 Subject: [PATCH 435/461] Testing Storage Server implementation --- .../source/mr-status-json-schemas.rst.inc | 10 + fdbcli/fdbcli.actor.cpp | 16 +- fdbclient/BackupAgentBase.actor.cpp | 18 +- fdbclient/CMakeLists.txt | 1 + fdbclient/CommitProxyInterface.h | 15 +- fdbclient/DatabaseConfiguration.cpp | 27 +- fdbclient/DatabaseConfiguration.h | 4 + fdbclient/DatabaseContext.h | 9 + fdbclient/ManagementAPI.actor.cpp | 39 +- fdbclient/NativeAPI.actor.cpp | 261 +++++++- fdbclient/Schemas.cpp | 17 +- fdbclient/StorageServerInterface.cpp | 465 ++++++++++++++ fdbclient/StorageServerInterface.h | 19 +- fdbclient/SystemData.cpp | 85 ++- fdbclient/SystemData.h | 5 + fdbrpc/CMakeLists.txt | 3 +- fdbrpc/LoadBalance.actor.h | 106 +++- fdbrpc/QueueModel.cpp | 35 ++ fdbrpc/QueueModel.h | 24 +- fdbrpc/TSSComparison.h | 78 +++ fdbrpc/fdbrpc.h | 2 + fdbrpc/simulator.h | 6 +- fdbserver/ApplyMetadataMutation.cpp | 7 + fdbserver/ClusterController.actor.cpp | 143 ++++- fdbserver/CommitProxyServer.actor.cpp | 3 + fdbserver/DataDistribution.actor.cpp | 569 +++++++++++++++--- fdbserver/DataDistributionTracker.actor.cpp | 14 +- fdbserver/Knobs.cpp | 1 + fdbserver/Knobs.h | 1 + 
fdbserver/MoveKeys.actor.cpp | 482 +++++++++++---- fdbserver/MoveKeys.actor.h | 3 +- fdbserver/MutationTracking.cpp | 3 + fdbserver/QuietDatabase.actor.cpp | 16 +- fdbserver/Ratekeeper.actor.cpp | 8 +- fdbserver/SimulatedCluster.actor.cpp | 44 ++ fdbserver/Status.actor.cpp | 16 +- fdbserver/TLogServer.actor.cpp | 5 + fdbserver/WorkerInterface.actor.h | 11 +- fdbserver/masterserver.actor.cpp | 6 + fdbserver/storageserver.actor.cpp | 562 +++++++++++++++-- fdbserver/tester.actor.cpp | 20 + fdbserver/worker.actor.cpp | 60 +- .../workloads/ConsistencyCheck.actor.cpp | 190 +++++- fdbserver/workloads/RandomMoveKeys.actor.cpp | 3 +- fdbserver/workloads/workloads.actor.h | 6 +- flow/ProtocolVersion.h | 4 +- flow/genericactors.actor.h | 2 + flow/serialize.h | 9 + tests/CMakeLists.txt | 2 + tests/StorageServerInterface.txt | 7 + tests/SystemData.txt | 7 + 51 files changed, 3128 insertions(+), 321 deletions(-) create mode 100644 fdbclient/StorageServerInterface.cpp create mode 100644 fdbrpc/TSSComparison.h create mode 100644 tests/StorageServerInterface.txt create mode 100644 tests/SystemData.txt diff --git a/documentation/sphinx/source/mr-status-json-schemas.rst.inc b/documentation/sphinx/source/mr-status-json-schemas.rst.inc index 7979331898..914a682c4c 100644 --- a/documentation/sphinx/source/mr-status-json-schemas.rst.inc +++ b/documentation/sphinx/source/mr-status-json-schemas.rst.inc @@ -682,6 +682,16 @@ "ssd-rocksdb-experimental", "memory" ]}, + "tss_count":1, + "tss_storage_engine":{ + "$enum":[ + "ssd", + "ssd-1", + "ssd-2", + "ssd-redwood-experimental", + "ssd-rocksdb-experimental", + "memory" + ]}, "coordinators_count":1, "excluded_servers":[ { diff --git a/fdbcli/fdbcli.actor.cpp b/fdbcli/fdbcli.actor.cpp index 7f1bb3b735..0b53bcde6d 100644 --- a/fdbcli/fdbcli.actor.cpp +++ b/fdbcli/fdbcli.actor.cpp @@ -501,7 +501,10 @@ void initHelp() { "change the database configuration", "The `new' option, if present, initializes a new database with the given configuration 
rather than changing " "the configuration of an existing one. When used, both a redundancy mode and a storage engine must be " - "specified.\n\nRedundancy mode:\n single - one copy of the data. Not fault tolerant.\n double - two copies " + "specified.\n\ntss: when enabled, configures the testing storage server for the cluster instead." + "When used with new to set up tss for the first time, it requires both a count and a storage engine." + "To disable the testing storage server, run \"configure tss count=0\"\n\n" + "Redundancy mode:\n single - one copy of the data. Not fault tolerant.\n double - two copies " "of data (survive one failure).\n triple - three copies of data (survive two failures).\n three_data_hall - " "See the Admin Guide.\n three_datacenter - See the Admin Guide.\n\nStorage engine:\n ssd - B-Tree storage " "engine optimized for solid state disks.\n memory - Durable in-memory storage engine for small " @@ -1128,6 +1131,17 @@ void printStatus(StatusObjectReader statusObj, if (statusObjConfig.get("log_routers", intVal)) outputString += format("\n Desired Log Routers - %d", intVal); + if (statusObjConfig.get("tss_count", intVal) && intVal > 0) { + int activeTss = 0; + if (statusObjCluster.has("active_tss_count")) { + statusObjCluster.get("active_tss_count", activeTss); + } + outputString += format("\n TSS - %d/%d", activeTss, intVal); + + if (statusObjConfig.get("tss_storage_engine", strVal)) + outputString += format("\n TSS Storage Engine - %s", strVal.c_str()); + } + outputString += "\n Usable Regions - "; if (statusObjConfig.get("usable_regions", intVal)) { outputString += std::to_string(intVal); diff --git a/fdbclient/BackupAgentBase.actor.cpp b/fdbclient/BackupAgentBase.actor.cpp index fba2e69954..cc861f310a 100644 --- a/fdbclient/BackupAgentBase.actor.cpp +++ b/fdbclient/BackupAgentBase.actor.cpp @@ -404,8 +404,14 @@ ACTOR Future readCommitted(Database cx, state RangeResult values = wait(tr.getRange(begin, end, limits)); // When this buggify line 
is enabled, if there are more than 1 result then use half of the results + // Copy the data instead of messing with the results directly to avoid TSS issues. if (values.size() > 1 && BUGGIFY) { - values.resize(values.arena(), values.size() / 2); + Standalone copy; + // only copy first half of values into copy + for (int i = 0; i < values.size() / 2; i++) { + copy.push_back_deep(copy.arena(), values[i]); + } + values = copy; values.more = true; // Half of the time wait for this tr to expire so that the next read is at a different version if (deterministicRandom()->random01() < 0.5) @@ -469,9 +475,15 @@ ACTOR Future readCommitted(Database cx, state RangeResult rangevalue = wait(tr.getRange(nextKey, end, limits)); - // When this buggify line is enabled, if there are more than 1 result then use half of the results + // When this buggify line is enabled, if there are more than 1 result then use half of the results. + // Copy the data instead of messing with the results directly to avoid TSS issues. 
if (rangevalue.size() > 1 && BUGGIFY) { - rangevalue.resize(rangevalue.arena(), rangevalue.size() / 2); + Standalone copy; + // only copy first half of rangevalue into copy + for (int i = 0; i < rangevalue.size() / 2; i++) { + copy.push_back_deep(copy.arena(), rangevalue[i]); + } + rangevalue = copy; rangevalue.more = true; // Half of the time wait for this tr to expire so that the next read is at a different version if (deterministicRandom()->random01() < 0.5) diff --git a/fdbclient/CMakeLists.txt b/fdbclient/CMakeLists.txt index bd14ef7b52..75b9fd5a0a 100644 --- a/fdbclient/CMakeLists.txt +++ b/fdbclient/CMakeLists.txt @@ -68,6 +68,7 @@ set(FDBCLIENT_SRCS Status.h StatusClient.actor.cpp StatusClient.h + StorageServerInterface.cpp StorageServerInterface.h Subspace.cpp Subspace.h diff --git a/fdbclient/CommitProxyInterface.h b/fdbclient/CommitProxyInterface.h index 794b88ceaa..16f6695a03 100644 --- a/fdbclient/CommitProxyInterface.h +++ b/fdbclient/CommitProxyInterface.h @@ -116,18 +116,31 @@ struct ClientDBInfo { firstCommitProxy; // not serialized, used for commitOnFirstProxy when the commit proxies vector has been shrunk Optional forward; vector history; + vector> + tssMapping; // logically map for all active TSS pairs ClientDBInfo() {} bool operator==(ClientDBInfo const& r) const { return id == r.id; } bool operator!=(ClientDBInfo const& r) const { return id != r.id; } + // convenience method to treat tss mapping like a map + // TODO can serializer handle maps? 
could just change it + Optional getTssPair(UID storageServerID) const { + for (auto& it : tssMapping) { + if (it.first == storageServerID) { + return Optional(it.second); + } + } + return Optional(); + } + template void serialize(Archive& ar) { if constexpr (!is_fb_function) { ASSERT(ar.protocolVersion().isValid()); } - serializer(ar, grvProxies, commitProxies, id, forward, history); + serializer(ar, grvProxies, commitProxies, id, forward, history, tssMapping); } }; diff --git a/fdbclient/DatabaseConfiguration.cpp b/fdbclient/DatabaseConfiguration.cpp index 838f6c3c10..a2cfc435b3 100644 --- a/fdbclient/DatabaseConfiguration.cpp +++ b/fdbclient/DatabaseConfiguration.cpp @@ -31,7 +31,8 @@ void DatabaseConfiguration::resetInternal() { commitProxyCount = grvProxyCount = resolverCount = desiredTLogCount = tLogWriteAntiQuorum = tLogReplicationFactor = storageTeamSize = desiredLogRouterCount = -1; tLogVersion = TLogVersion::DEFAULT; - tLogDataStoreType = storageServerStoreType = KeyValueStoreType::END; + tLogDataStoreType = storageServerStoreType = testingStorageServerStoreType = KeyValueStoreType::END; + desiredTSSCount = 0; tLogSpillType = TLogSpillType::DEFAULT; autoCommitProxyCount = CLIENT_KNOBS->DEFAULT_AUTO_COMMIT_PROXIES; autoGrvProxyCount = CLIENT_KNOBS->DEFAULT_AUTO_GRV_PROXIES; @@ -299,6 +300,25 @@ StatusObject DatabaseConfiguration::toJSON(bool noPolicies) const { result["storage_engine"] = "custom"; } + if (desiredTSSCount > 0) { + result["tss_count"] = desiredTSSCount; + if (testingStorageServerStoreType == KeyValueStoreType::SSD_BTREE_V1) { + result["tss_storage_engine"] = "ssd-1"; + } else if (testingStorageServerStoreType == KeyValueStoreType::SSD_BTREE_V2) { + result["tss_storage_engine"] = "ssd-2"; + } else if (testingStorageServerStoreType == KeyValueStoreType::SSD_REDWOOD_V1) { + result["tss_storage_engine"] = "ssd-redwood-experimental"; + } else if (testingStorageServerStoreType == KeyValueStoreType::SSD_ROCKSDB_V1) { + result["tss_storage_engine"] = 
"ssd-rocksdb-experimental"; + } else if (testingStorageServerStoreType == KeyValueStoreType::MEMORY_RADIXTREE) { + result["tss_storage_engine"] = "memory-radixtree-beta"; + } else if (testingStorageServerStoreType == KeyValueStoreType::MEMORY) { + result["tss_storage_engine"] = "memory-2"; + } else { + result["tss_storage_engine"] = "custom"; + } + } + result["log_spill"] = (int)tLogSpillType; if (remoteTLogReplicationFactor == 1) { @@ -449,6 +469,8 @@ bool DatabaseConfiguration::setInternal(KeyRef key, ValueRef value) { } } else if (ck == LiteralStringRef("storage_replicas")) { parse(&storageTeamSize, value); + } else if (ck == LiteralStringRef("tss_count")) { + parse(&desiredTSSCount, value); } else if (ck == LiteralStringRef("log_version")) { parse((&type), value); type = std::max((int)TLogVersion::MIN_RECRUITABLE, type); @@ -471,6 +493,9 @@ bool DatabaseConfiguration::setInternal(KeyRef key, ValueRef value) { } else if (ck == LiteralStringRef("storage_engine")) { parse((&type), value); storageServerStoreType = (KeyValueStoreType::StoreType)type; + } else if (ck == LiteralStringRef("tss_storage_engine")) { + parse((&type), value); + testingStorageServerStoreType = (KeyValueStoreType::StoreType)type; } else if (ck == LiteralStringRef("auto_commit_proxies")) { parse(&autoCommitProxyCount, value); } else if (ck == LiteralStringRef("auto_grv_proxies")) { diff --git a/fdbclient/DatabaseConfiguration.h b/fdbclient/DatabaseConfiguration.h index ef539f40b0..0df45ce228 100644 --- a/fdbclient/DatabaseConfiguration.h +++ b/fdbclient/DatabaseConfiguration.h @@ -225,6 +225,10 @@ struct DatabaseConfiguration { int32_t storageTeamSize; KeyValueStoreType storageServerStoreType; + // Testing StorageServers + int32_t desiredTSSCount; + KeyValueStoreType testingStorageServerStoreType; + // Remote TLogs int32_t desiredLogRouterCount; int32_t remoteDesiredTLogCount; diff --git a/fdbclient/DatabaseContext.h b/fdbclient/DatabaseContext.h index ae1a5a741b..b1dee87f18 100644 --- 
a/fdbclient/DatabaseContext.h +++ b/fdbclient/DatabaseContext.h @@ -273,6 +273,9 @@ public: Reference>> connectionFile; AsyncTrigger proxiesChangeTrigger; Future monitorProxiesInfoChange; + Future monitorTssInfoChange; + Future tssMismatchHandler; + PromiseStream tssMismatchStream; Reference commitProxies; Reference grvProxies; bool proxyProvisional; // Provisional commit proxy and grv proxy are used at the same time. @@ -320,6 +323,8 @@ public: std::map server_interf; + std::map> tssMetrics; + UID dbId; bool internal; // Only contexts created through the C client and fdbcli are non-internal @@ -419,6 +424,10 @@ public: static bool debugUseTags; static const std::vector debugTransactionTagChoices; std::unordered_map> watchMap; + + // TODO should this be private? + void maybeAddTssMapping(StorageServerInterface const& ssi); + void addTssMapping(StorageServerInterface const& ssi, StorageServerInterface const& tssi); }; #endif diff --git a/fdbclient/ManagementAPI.actor.cpp b/fdbclient/ManagementAPI.actor.cpp index f53cf65828..8b4d03b4d8 100644 --- a/fdbclient/ManagementAPI.actor.cpp +++ b/fdbclient/ManagementAPI.actor.cpp @@ -60,6 +60,13 @@ std::map configForToken(std::string const& mode) { return out; } + if (mode == "tss") { + // Set temporary marker in config map to mark that this is a tss configuration and not a normal storage/log + // configuration. A bit of a hack but reuses the parsing code nicely. + out[p + "istss"] = "1"; + return out; + } + if (mode == "locked") { // Setting this key is interpreted as an instruction to use the normal version-stamp-based mechanism for locking // the database. 
@@ -119,7 +126,7 @@ std::map configForToken(std::string const& mode) { if ((key == "logs" || key == "commit_proxies" || key == "grv_proxies" || key == "resolvers" || key == "remote_logs" || key == "log_routers" || key == "usable_regions" || - key == "repopulate_anti_quorum") && + key == "repopulate_anti_quorum" || key == "count") && isInteger(value)) { out[p + key] = value; } @@ -334,6 +341,36 @@ ConfigurationResult buildConfiguration(std::vector const& modeTokens, serializeReplicationPolicy(policyWriter, logPolicy); outConf[p + "log_replication_policy"] = policyWriter.toValue().toString(); } + if (outConf.count(p + "istss")) { + // redo config parameters to be tss config instead of normal config + + // save param values from parsing as a normal config + bool isNew = outConf.count(p + "initialized"); + Optional count; + Optional storageEngine; + if (outConf.count(p + "count")) { + count = Optional(outConf[p + "count"]); + } + if (outConf.count(p + "storage_engine")) { + storageEngine = Optional(outConf[p + "storage_engine"]); + } + + // A new tss setup must have count + storage engine. An adjustment must have at least one. + if ((isNew && (!count.present() || !storageEngine.present())) || + (!isNew && !count.present() && !storageEngine.present())) { + // TODO is this the right error type? And should we log something? 
+ return ConfigurationResult::INCOMPLETE_CONFIGURATION; + } + + // clear map and only reset tss parameters + outConf.clear(); + if (count.present()) { + outConf[p + "tss_count"] = count.get(); + } + if (storageEngine.present()) { + outConf[p + "tss_storage_engine"] = storageEngine.get(); + } + } return ConfigurationResult::SUCCESS; } diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 75c11db594..9cd9a32d8c 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -38,6 +38,7 @@ #include "fdbclient/DatabaseContext.h" #include "fdbclient/GlobalConfig.actor.h" #include "fdbclient/JsonBuilder.h" +#include "fdbclient/KeyBackedTypes.h" #include "fdbclient/KeyRangeMap.h" #include "fdbclient/Knobs.h" #include "fdbclient/ManagementAPI.actor.h" @@ -121,6 +122,52 @@ NetworkOptions::NetworkOptions() static const Key CLIENT_LATENCY_INFO_PREFIX = LiteralStringRef("client_latency/"); static const Key CLIENT_LATENCY_INFO_CTR_PREFIX = LiteralStringRef("client_latency_counter/"); +// TODO make tss function here +void DatabaseContext::maybeAddTssMapping(StorageServerInterface const& ssi) { + // add tss mapping if server is new + + Optional tssPair = clientInfo->get().getTssPair(ssi.id()); + if (tssPair.present()) { + addTssMapping(ssi, tssPair.get()); + } +} + +// calling getInterface potentially recursively is weird, but since this function is only called when an entry is +// created/changed, the recursive call should never recurse itself. +void DatabaseContext::addTssMapping(StorageServerInterface const& ssi, StorageServerInterface const& tssi) { + // TODO get both with a getInterface call which will create the tss endpoint and/or update both endpoints if there + // was a change in endpoint tokens + + // the order of these is important because it hits the "different token same locality" issue, so we always want to + // request the tss first so the ss request overrides it. 
+ // TODO this shouldn't be necessary after i stop doing the same server hack + Reference tssInfo = StorageServerInfo::getInterface(this, tssi, clientLocality); + Reference ssInfo = StorageServerInfo::getInterface(this, ssi, clientLocality); + + // add new tss metrics object to queue + Reference metrics = makeReference(); + tssMetrics[tssi.id()] = metrics; + + // TODO any other requests it makes sense to duplicate? + // add each read data request interface to map (getValue, getKey, getKeyValues, watchValue) + queueModel.updateTssEndpoint( + ssInfo->interf.getValue.getEndpoint().token.first(), + TSSEndpointData(tssi.id(), tssInfo->interf.getValue.getEndpoint(), metrics, clientInfo->get().id)); + queueModel.updateTssEndpoint( + ssInfo->interf.getKey.getEndpoint().token.first(), + TSSEndpointData(tssi.id(), tssInfo->interf.getKey.getEndpoint(), metrics, clientInfo->get().id)); + queueModel.updateTssEndpoint( + ssInfo->interf.getKeyValues.getEndpoint().token.first(), + TSSEndpointData(tssi.id(), tssInfo->interf.getKeyValues.getEndpoint(), metrics, clientInfo->get().id)); + queueModel.updateTssEndpoint( + ssInfo->interf.watchValue.getEndpoint().token.first(), + TSSEndpointData(tssi.id(), tssInfo->interf.watchValue.getEndpoint(), metrics, clientInfo->get().id)); + + // TODO REMOVE + printf( + "added tss endpoints to queue for mapping %s=%s\n", ssi.id().toString().c_str(), tssi.id().toString().c_str()); +} + Reference StorageServerInfo::getInterface(DatabaseContext* cx, StorageServerInterface const& ssi, LocalityData const& locality) { @@ -133,11 +180,19 @@ Reference StorageServerInfo::getInterface(DatabaseContext* cx // pointing to. This is technically correct, but is very unnatural. We may want to refactor load // balance to take an AsyncVar> so that it is notified when the interface // changes. 
+ it->second->interf = ssi; + + // TODO remove print + printf("maybeAddTss same locality %s\n", ssi.id().toString().c_str()); + cx->maybeAddTssMapping(ssi); } else { it->second->notifyContextDestroyed(); Reference loc(new StorageServerInfo(cx, ssi, locality)); cx->server_interf[ssi.id()] = loc.getPtr(); + // TODO REMOVE print + printf("maybeAddTss different locality %s\n", ssi.id().toString().c_str()); + cx->maybeAddTssMapping(ssi); return loc; } } @@ -147,6 +202,9 @@ Reference StorageServerInfo::getInterface(DatabaseContext* cx Reference loc(new StorageServerInfo(cx, ssi, locality)); cx->server_interf[ssi.id()] = loc.getPtr(); + // TODO REMOVE print + // printf("maybeAddTss new ssi %s\n", ssi.id().toString().c_str()); + cx->maybeAddTssMapping(ssi); return loc; } @@ -327,6 +385,55 @@ ACTOR Future databaseLogger(DatabaseContext* cx) { cx->mutationsPerCommit.clear(); cx->bytesPerCommit.clear(); + for (const auto& it : cx->tssMetrics) { + // TODO could skip this tss if request counter is zero? would potentially complicate elapsed calculation + // though + if (it.second->mismatches.getIntervalDelta()) { + printf("Found tss %s with %d mismatches!!\n", + it.first.toString().c_str(), + it.second->mismatches.getIntervalDelta()); + cx->tssMismatchStream.send(it.first); + } + TraceEvent tssEv("TSSClientMetrics", cx->dbId); + tssEv.detail("TSSID", it.first) + .detail("Elapsed", (lastLogged == 0) ? 
0 : now() - lastLogged) + .detail("Internal", cx->internal); + + it.second->cc.logToTraceEvent(tssEv); + + tssEv.detail("MeanSSGetValueLatency", it.second->SSgetValueLatency.mean()) + .detail("MedianSSGetValueLatency", it.second->SSgetValueLatency.median()) + .detail("SSGetValueLatency90", it.second->SSgetValueLatency.percentile(0.90)) + .detail("SSGetValueLatency99", it.second->SSgetValueLatency.percentile(0.99)); + + tssEv.detail("MeanTSSGetValueLatency", it.second->TSSgetValueLatency.mean()) + .detail("MedianTSSGetValueLatency", it.second->TSSgetValueLatency.median()) + .detail("TSSGetValueLatency90", it.second->TSSgetValueLatency.percentile(0.90)) + .detail("TSSGetValueLatencyDiff99", it.second->TSSgetValueLatency.percentile(0.99)); + + tssEv.detail("MeanSSGetKeyLatency", it.second->SSgetKeyLatency.mean()) + .detail("MedianSSGetKeyLatency", it.second->SSgetKeyLatency.median()) + .detail("SSGetKeyLatency90", it.second->SSgetKeyLatency.percentile(0.90)) + .detail("SSGetKeyLatency99", it.second->SSgetKeyLatency.percentile(0.99)); + + tssEv.detail("MeanTSSGetKeyLatency", it.second->TSSgetKeyLatency.mean()) + .detail("MedianTSSGetKeyLatency", it.second->TSSgetKeyLatency.median()) + .detail("TSSGetKeyLatency90", it.second->TSSgetKeyLatency.percentile(0.90)) + .detail("TSSGetKeyLatencyDiff99", it.second->TSSgetKeyLatency.percentile(0.99)); + + tssEv.detail("MeanSSGetKeyValuesLatency", it.second->SSgetKeyLatency.mean()) + .detail("MedianSSGetKeyValuesLatency", it.second->SSgetKeyLatency.median()) + .detail("SSGetKeyValuesLatency90", it.second->SSgetKeyLatency.percentile(0.90)) + .detail("SSGetKeyValuesLatency99", it.second->SSgetKeyLatency.percentile(0.99)); + + tssEv.detail("MeanTSSGetKeyValuesLatency", it.second->TSSgetKeyValuesLatency.mean()) + .detail("MedianTSSGetKeyValuesLatency", it.second->TSSgetKeyValuesLatency.median()) + .detail("TSSGetKeyValuesLatency90", it.second->TSSgetKeyValuesLatency.percentile(0.90)) + .detail("TSSGetKeyValuesLatencyDiff99", 
it.second->TSSgetKeyValuesLatency.percentile(0.99)); + + it.second->clear(); + } + lastLogged = now(); } } @@ -711,6 +818,110 @@ ACTOR Future monitorCacheList(DatabaseContext* self) { } } +// updates tss mapping when set of tss servers changes +ACTOR static Future monitorTssChange(DatabaseContext* cx) { + state vector> curTssMapping; + curTssMapping = cx->clientInfo->get().tssMapping; + + loop { + wait(cx->clientInfo->onChange()); + if (cx->clientInfo->get().tssMapping != curTssMapping) { + // TODO maybe re-read this from system keys instead if it changes + ClientDBInfo clientInfo = cx->clientInfo->get(); + curTssMapping = clientInfo.tssMapping; + + // TODO REMOVE print + // printf("gonna do tss stuff with %d tss's\n", curTssMapping.size()); + + std::unordered_set seenTssIds; + + if (curTssMapping.size()) { + for (const auto& it : curTssMapping) { + seenTssIds.insert(it.second.id()); + + if (cx->server_interf.count(it.first)) { + // TODO REMOVE + printf("found new tss mapping %s -> %s\n", + it.first.toString().c_str(), + it.second.id().toString().c_str()); + cx->addTssMapping(cx->server_interf[it.first]->interf, it.second); + } else { + // TODO REMOVE case and print + // printf("server %s with tss pair %s not in server_interf, skipping for now\n", + // it.first.toString().c_str(), it.second.id().toString().c_str()); + } + } + } + + for (auto it = cx->tssMetrics.begin(); it != cx->tssMetrics.end();) { + if (seenTssIds.count(it->first)) { + it++; + } else { + // TODO REMOVE + printf("Erasing tss %s from tss_metrics\n", it->first.toString().c_str()); + it = cx->tssMetrics.erase(it); + } + } + + cx->queueModel.removeOldTssData(clientInfo.id); + } + } +} + +ACTOR static Future handleTssMismatches(DatabaseContext* cx) { + state Reference tr; + state KeyBackedMap tssMapDB = KeyBackedMap(tssMappingKeys.begin); + loop { + state UID tssID = waitNext(cx->tssMismatchStream.getFuture()); + // find ss pair id so we can remove it from the mapping + state UID tssPairID; + bool 
found = false; + for (const auto& it : cx->clientInfo->get().tssMapping) { + if (it.second.id() == tssID) { + tssPairID = it.first; + found = true; + break; + } + } + // TODO maybe instead of assert, do a trace event because it's possible that by the time we checked the mismatch + // the tss is gone? + if (found) { + // TODO add trace event + TEST(true); // killing TSS because it got mismatch + printf("KILLING TSS %s (partner=%s) BECAUSE OF TSS MISMATCH\n", + tssID.toString().c_str(), + tssPairID.toString().c_str()); + + // TODO we could write something to the system keyspace and then have DD listen to that keyspace and then DD + // do exactly this, so why not just cut out the middle man (or the middle system keys, as it were) + tr = makeReference(Database(Reference::addRef(cx))); + loop { + try { + tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + + tr->clear(serverTagKeyFor(tssID)); + tssMapDB.erase(tr, tssPairID); + + tr->set(tssMappingChangeKey, deterministicRandom()->randomUniqueID().toString()); + wait(tr->commit()); + + break; + } catch (Error& e) { + printf("Kill Mismatch TSS Transaction got error %d\n", e.code()); + wait(tr->onError(e)); + } + } + tr = makeReference(); // clear out txn so that the extra ref gets decref'd and we + // can free cx + + } else { + TEST(true); // Not killing TSS with mismatch because it's already gone + printf("Not killing TSS %s because of tss mismatch, must be already removed\n", tssID.toString().c_str()); + } + } +} + ACTOR static Future getHealthMetricsActor(DatabaseContext* cx, bool detailed) { if (now() - cx->healthMetricsLastUpdated < CLIENT_KNOBS->AGGREGATE_HEALTH_METRICS_MAX_STALENESS) { if (detailed) { @@ -960,6 +1171,8 @@ DatabaseContext::DatabaseContext(Reference> clientInfo, DatabaseContext::~DatabaseContext() { cacheListMonitor.cancel(); monitorProxiesInfoChange.cancel(); + monitorTssInfoChange.cancel(); + tssMismatchHandler.cancel(); for 
(auto it = server_interf.begin(); it != server_interf.end(); it = server_interf.erase(it)) it->second->notifyContextDestroyed(); ASSERT_ABORT(server_interf.empty()); @@ -2345,6 +2560,11 @@ ACTOR Future getKey(Database cx, KeySelector k, Future version, Tr "NativeAPI.getKey.Before"); //.detail("StartKey", // k.getKey()).detail("Offset",k.offset).detail("OrEqual",k.orEqual); ++cx->transactionPhysicalReads; + + GetKeyRequest req( + span.context, k, version.get(), cx->sampleReadTags() ? tags : Optional(), getKeyID); + req.arena.dependsOn(k.arena()); + state GetKeyReply reply; try { choose { @@ -2353,11 +2573,7 @@ ACTOR Future getKey(Database cx, KeySelector k, Future version, Tr wait(loadBalance(cx.getPtr(), ssi.second, &StorageServerInterface::getKey, - GetKeyRequest(span.context, - k, - version.get(), - cx->sampleReadTags() ? tags : Optional(), - getKeyID), + req, TaskPriority::DefaultPromiseEndpoint, false, cx->enableLocalityLoadBalance ? &cx->queueModel : nullptr))) { @@ -2718,6 +2934,9 @@ ACTOR Future getExactRange(Database cx, req.end = firstGreaterOrEqual(range.end); req.spanContext = span.context; + // keep shard's arena around in case of async tss comparison + req.arena.dependsOn(locations[shard].first.arena()); + transformRangeLimits(limits, reverse, req); ASSERT(req.limitBytes > 0 && req.limit != 0 && req.limit < 0 == reverse); @@ -3034,6 +3253,9 @@ ACTOR Future getRange(Database cx, req.isFetchKeys = (info.taskID == TaskPriority::FetchKeys); req.version = readVersion; + // In case of async tss comparison, also make req arena depend on begin, end, and/or shard's arena depending + // on which is used + bool dependOnShard = false; if (reverse && (begin - 1).isDefinitelyLess(shard.begin) && (!begin.isFirstGreaterOrEqual() || begin.getKey() != shard.begin)) { // In this case we would be setting modifiedSelectors to true, but @@ -3041,14 +3263,23 @@ ACTOR Future getRange(Database cx, req.begin = firstGreaterOrEqual(shard.begin); modifiedSelectors = true; - } else 
+ req.arena.dependsOn(shard.arena()); + dependOnShard = true; + } else { req.begin = begin; + req.arena.dependsOn(begin.arena()); + } if (!reverse && end.isDefinitelyGreater(shard.end)) { req.end = firstGreaterOrEqual(shard.end); modifiedSelectors = true; - } else + if (!dependOnShard) { + req.arena.dependsOn(shard.arena()); + } + } else { req.end = end; + req.arena.dependsOn(end.arena()); + } transformRangeLimits(limits, reverse, req); ASSERT(req.limitBytes > 0 && req.limit != 0 && req.limit < 0 == reverse); @@ -3133,10 +3364,18 @@ ACTOR Future getRange(Database cx, output.readThroughEnd = readThroughEnd; if (BUGGIFY && limits.hasByteLimit() && output.size() > std::max(1, originalLimits.minRows)) { + printf("Buggify resizing in nativeapi\n"); + // Copy instead of resizing because TSS maybe be using output's arena for comparison. This only + // happens in simulation so it's fine + Standalone copy; + int newSize = + deterministicRandom()->randomInt(std::max(1, originalLimits.minRows), output.size()); + for (int i = 0; i < newSize; i++) { + copy.push_back_deep(copy.arena(), output[i]); + } + output = copy; output.more = true; - output.resize( - output.arena(), - deterministicRandom()->randomInt(std::max(1, originalLimits.minRows), output.size())); + getRangeFinished(cx, trLogInfo, startTime, @@ -4180,6 +4419,8 @@ ACTOR static Future tryCommit(Database cx, choose { when(wait(cx->onProxiesChanged())) { reply.cancel(); + // TODO REMOVE + printf("tryCommit proxies changed ERROR!\n"); throw request_maybe_delivered(); } when(CommitID ci = wait(reply)) { diff --git a/fdbclient/Schemas.cpp b/fdbclient/Schemas.cpp index 124fb17873..514866fe83 100644 --- a/fdbclient/Schemas.cpp +++ b/fdbclient/Schemas.cpp @@ -431,6 +431,7 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema( "seconds" : 1.0, "versions" : 1000000 }, + "active_tss_count":0, "degraded_processes":0, "database_available":true, "database_lock_state": { @@ -729,6 +730,19 @@ const KeyRef 
JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema( "memory-2", "memory-radixtree-beta" ]}, + "tss_count":1, + "tss_storage_engine":{ + "$enum":[ + "ssd", + "ssd-1", + "ssd-2", + "ssd-redwood-experimental", + "ssd-rocksdb-experimental", + "memory", + "memory-1", + "memory-2", + "memory-radixtree-beta" + ]}, "coordinators_count":1, "excluded_servers":[ { @@ -802,7 +816,8 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema( } } ], - "least_operating_space_bytes_storage_server":0 + "least_operating_space_bytes_storage_server":0, + "max_machine_failures_without_losing_data":0 }, "machines":{ "$map":{ diff --git a/fdbclient/StorageServerInterface.cpp b/fdbclient/StorageServerInterface.cpp new file mode 100644 index 0000000000..180d1b814c --- /dev/null +++ b/fdbclient/StorageServerInterface.cpp @@ -0,0 +1,465 @@ +/* + * StorageServerInterface.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "fdbclient/StorageServerInterface.h" +#include "flow/crc32c.h" // for crc32c_append, to checksum values in tss trace events + +// Includes template specializations for all tss operations on storage server types. +// New StorageServerInterface reply types must be added here or it won't compile. 
+ +// if size + hex of checksum is shorter than value, record that instead of actual value. break-even point is 12 +// characters +std::string traceChecksumValue(ValueRef s) { + return s.size() > 12 ? format("(%d)%08x", s.size(), crc32c_append(0, s.begin(), s.size())) : s.toString(); +} + +template <> +bool TSS_doCompare(const GetValueRequest& req, + const GetValueReply& src, + const GetValueReply& tss, + Severity traceSeverity, + UID tssId) { + if (src.value.present() != tss.value.present() || (src.value.present() && src.value.get() != tss.value.get())) { + printf("GetValue %s @ %lld mismatch: src=%s, tss=%s\n", + req.key.printable().c_str(), + req.version, + src.value.present() ? traceChecksumValue(src.value.get()).c_str() : "missing", + tss.value.present() ? traceChecksumValue(tss.value.get()).c_str() : "missing"); + TraceEvent(traceSeverity, "TSSMismatchGetValue") + .suppressFor(1.0) + .detail("TSSID", tssId) + .detail("Key", req.key.printable()) + .detail("Version", req.version) + .detail("SSReply", src.value.present() ? traceChecksumValue(src.value.get()) : "missing") + .detail("TSSReply", tss.value.present() ? traceChecksumValue(tss.value.get()) : "missing"); + + return false; + } + // printf("tss GetValueReply matched! src=%s, tss=%s\n", src.value.present() ? src.value.get().toString().c_str() : + // "missing", tss.value.present() ? tss.value.get().toString().c_str() : "missing"); + return true; +} + +template <> +bool TSS_doCompare(const GetKeyRequest& req, + const GetKeyReply& src, + const GetKeyReply& tss, + Severity traceSeverity, + UID tssId) { + // This process is a bit complicated. Since the tss and ss can return different results if neighboring shards to + // req.sel.key are currently being moved, We validate that the results are the same IF the returned key selectors + // are final. 
Otherwise, we only mark the request as a mismatch if the difference between the two returned key + // selectors could ONLY be because of different results from the storage engines. We can afford to only partially + // check key selectors that start in a TSS shard and end in a non-TSS shard because the other read queries and the + // consistency check will eventually catch a misbehaving storage engine. + bool matches = true; + // printf("GetKey %s:<%s:%d @ %lld start:\n", + // req.sel.getKey().toString().c_str(), req.sel.orEqual ? "=" : "", req.sel.offset, req.version); + if (src.sel.orEqual == tss.sel.orEqual && src.sel.offset == tss.sel.offset) { + // full matching case + if (src.sel.offset == 0 && src.sel.orEqual) { + // found exact key, should be identical + matches = src.sel.getKey() == tss.sel.getKey(); + } + // if the query doesn't return the final key, there is an edge case where the ss and tss have different shard + // boundaries, so they pass different shard boundary keys back for the same offset + } else if (src.sel.getKey() == tss.sel.getKey()) { + // There is one case with a positive offset where the shard boundary the incomplete query stopped at is the next + // key in the shard that the complete query returned. 
This is not possible with a negative offset because the + // shard boundary is exclusive backwards + if (src.sel.offset == 0 && src.sel.orEqual && tss.sel.offset == 1 && !tss.sel.orEqual) { + // case where ss was complete and tss was incomplete + } else if (tss.sel.offset == 0 && tss.sel.orEqual && src.sel.offset == 1 && !src.sel.orEqual) { + // case where tss was complete and ss was incomplete + } else { + matches = false; + } + } else { + // ss/tss returned different keys, and different offsets and/or orEqual + // here we just validate that ordering of the keys matches the ordering of the offsets + bool tssKeyLarger = src.sel.getKey() < tss.sel.getKey(); + // the only case offsets are equal and orEqual aren't equal is the case with a negative offset, + // where one response has <=0 with the actual result and the other has <0 with the shard upper boundary. + // So whichever one has the actual result should have the lower key. + bool tssOffsetLarger = (src.sel.offset == tss.sel.offset) ? tss.sel.orEqual : src.sel.offset < tss.sel.offset; + // printf(" partial comparison: tssLarger=%s, tssOffsetLarger=%s, matches=%s\n", tssKeyLarger ? "T" : "F", + // tssOffsetLarger ? "T": "F", matches ? "T" : "F"); + matches = tssKeyLarger != tssOffsetLarger; + } + if (!matches) { + // TODO REMOVE print + printf("GetKey %s:<%s:%d @ %lld mismatch: src=%s:<%s:%d, tss=%s:<%s:%d\n", + req.sel.getKey().printable().c_str(), + req.sel.orEqual ? "=" : "", + req.sel.offset, + req.version, + src.sel.getKey().printable().c_str(), + src.sel.orEqual ? "=" : "", + src.sel.offset, + tss.sel.getKey().printable().c_str(), + tss.sel.orEqual ? "=" : "", + tss.sel.offset); + TraceEvent(traceSeverity, "TSSMismatchGetKey") + .suppressFor(1.0) + .detail("TSSID", tssId) + .detail("KeySelector", + format("%s%s:%d", req.sel.orEqual ? "=" : "", req.sel.getKey().printable().c_str(), req.sel.offset)) + .detail("Version", req.version) + .detail("SSReply", + format("%s%s:%d", src.sel.orEqual ? 
"=" : "", src.sel.getKey().printable().c_str(), src.sel.offset)) + .detail( + "TSSReply", + format("%s%s:%d", tss.sel.orEqual ? "=" : "", tss.sel.getKey().printable().c_str(), tss.sel.offset)); + } + return matches; +} + +template <> +bool TSS_doCompare(const GetKeyValuesRequest& req, + const GetKeyValuesReply& src, + const GetKeyValuesReply& tss, + Severity traceSeverity, + UID tssId) { + if (src.more != tss.more || src.data != tss.data) { + // TODO REMOVE debugging prints + printf("GetKeyValues [%s:<%s:%d - %s:<%s:%d) @ %lld (lim=%d limB=%d) mismatch:\n", + req.begin.getKey().printable().c_str(), + req.begin.orEqual ? "=" : "", + req.begin.offset, + req.end.getKey().printable().c_str(), + req.end.orEqual ? "=" : "", + req.end.offset, + req.version, + req.limit, + req.limitBytes); + + std::string ssResultsString = format("(%d)%s:\n", src.data.size(), src.more ? "+" : ""); + printf("src= (%d)%s:", src.data.size(), src.more ? "+" : ""); + for (auto& it : src.data) { + printf(" %s=%s\n", it.key.printable().c_str(), it.value.printable().c_str()); + ssResultsString += "\n" + it.key.printable() + "=" + traceChecksumValue(it.value); + } + + std::string tssResultsString = format("(%d)%s:\n", tss.data.size(), tss.more ? "+" : ""); + printf("tss= (%d)%s:", tss.data.size(), tss.more ? "+" : ""); + for (auto& it : tss.data) { + printf(" %s=%s\n", it.key.printable().c_str(), it.value.printable().c_str()); + tssResultsString += "\n" + it.key.printable() + "=" + traceChecksumValue(it.value); + } + printf("\n"); + + TraceEvent(traceSeverity, "TSSMismatchGetKeyValues") + .suppressFor(1.0) + .detail("TSSID", tssId) + .detail( + "Begin", + format( + "%s%s:%d", req.begin.orEqual ? "=" : "", req.begin.getKey().printable().c_str(), req.begin.offset)) + .detail("End", + format("%s%s:%d", req.end.orEqual ? 
"=" : "", req.end.getKey().printable().c_str(), req.end.offset)) + .detail("Version", req.version) + .detail("Limit", req.limit) + .detail("LimitBytes", req.limitBytes) + .detail("SSReply", ssResultsString) + .detail("TSSReply", tssResultsString); + + return false; + } + /*printf("tss GetKeyValues [%s:<%s:%d - %s:<%s:%d) matched! %d=%d\n", + req.begin.getKey().printable().c_str(), req.begin.orEqual ? "=" : "", req.begin.offset, + req.end.getKey().printable().c_str(), req.end.orEqual ? "=" : "", req.end.offset, + src.data.size(), tss.data.size());*/ + return true; +} + +template <> +bool TSS_doCompare(const WatchValueRequest& req, + const WatchValueReply& src, + const WatchValueReply& tss, + Severity traceSeverity, + UID tssId) { + // TODO should this check that both returned the same version? We mainly want to duplicate watches just for load + return true; +} + +// no-op template specializations for metrics replies +template <> +bool TSS_doCompare(const WaitMetricsRequest& req, + const StorageMetrics& src, + const StorageMetrics& tss, + Severity traceSeverity, + UID tssId) { + return true; +} + +template <> +bool TSS_doCompare(const SplitMetricsRequest& req, + const SplitMetricsReply& src, + const SplitMetricsReply& tss, + Severity traceSeverity, + UID tssId) { + return true; +} + +template <> +bool TSS_doCompare(const ReadHotSubRangeRequest& req, + const ReadHotSubRangeReply& src, + const ReadHotSubRangeReply& tss, + Severity traceSeverity, + UID tssId) { + return true; +} + +template <> +bool TSS_doCompare(const SplitRangeRequest& req, + const SplitRangeReply& src, + const SplitRangeReply& tss, + Severity traceSeverity, + UID tssId) { + // TODO in theory this should return the same response from both right? 
+ return true; +} + +// don't duplicate \xff reads or fetchKeys (avoid adding load to servers) +template <> +bool TSS_shouldDuplicateRequest(const GetValueRequest& req) { + return req.key.size() == 0 || req.key[0] != 0xff; +} + +template <> +bool TSS_shouldDuplicateRequest(const GetKeyRequest& req) { + return req.sel.getKey().size() == 0 || req.sel.getKey()[0] != 0xff; +} + +template <> +bool TSS_shouldDuplicateRequest(const GetKeyValuesRequest& req) { + return (req.begin.getKey().size() == 0 || req.begin.getKey()[0] != 0xff || req.end.getKey().size() == 0 || + req.end.getKey()[0] != 0xff) && + !req.isFetchKeys; +} + +template <> +bool TSS_shouldDuplicateRequest(const WatchValueRequest& req) { + return req.key.size() == 0 || req.key[0] != 0xff; +} + +template <> +bool TSS_shouldDuplicateRequest(const WaitMetricsRequest& req) { + return false; +} + +template <> +bool TSS_shouldDuplicateRequest(const SplitMetricsRequest& req) { + return false; +} + +template <> +bool TSS_shouldDuplicateRequest(const ReadHotSubRangeRequest& req) { + return false; +} + +template <> +bool TSS_shouldDuplicateRequest(const SplitRangeRequest& req) { + return false; +} + +// only record metrics for data reads + +template <> +void TSSMetrics::recordLatency(const GetValueRequest& req, double ssLatency, double tssLatency) { + SSgetValueLatency.addSample(ssLatency); + TSSgetValueLatency.addSample(tssLatency); +} + +template <> +void TSSMetrics::recordLatency(const GetKeyRequest& req, double ssLatency, double tssLatency) { + SSgetKeyLatency.addSample(ssLatency); + TSSgetKeyLatency.addSample(tssLatency); +} + +template <> +void TSSMetrics::recordLatency(const GetKeyValuesRequest& req, double ssLatency, double tssLatency) { + SSgetKeyValuesLatency.addSample(ssLatency); + TSSgetKeyValuesLatency.addSample(tssLatency); +} + +template <> +void TSSMetrics::recordLatency(const WatchValueRequest& req, double ssLatency, double tssLatency) {} + +template <> +void TSSMetrics::recordLatency(const 
WaitMetricsRequest& req, double ssLatency, double tssLatency) {} + +template <> +void TSSMetrics::recordLatency(const SplitMetricsRequest& req, double ssLatency, double tssLatency) {} + +template <> +void TSSMetrics::recordLatency(const ReadHotSubRangeRequest& req, double ssLatency, double tssLatency) {} + +template <> +void TSSMetrics::recordLatency(const SplitRangeRequest& req, double ssLatency, double tssLatency) {} + +// ------------------- + +// TODO ADD UNIT TESTS for compare methods, especially GetKey!! +TEST_CASE("/StorageServerInterface/TSSCompare/TestComparison") { + printf("testing tss comparisons\n"); + + // test getValue + GetValueRequest gvReq; + gvReq.key = StringRef("a"); + gvReq.version = 5; + + UID tssId; + + GetValueReply gvReplyMissing; + GetValueReply gvReplyA(Optional(StringRef("a")), false); + GetValueReply gvReplyB(Optional(StringRef("b")), false); + ASSERT(TSS_doCompare(gvReq, gvReplyMissing, gvReplyMissing, SevInfo, tssId)); + ASSERT(TSS_doCompare(gvReq, gvReplyA, gvReplyA, SevInfo, tssId)); + ASSERT(TSS_doCompare(gvReq, gvReplyB, gvReplyB, SevInfo, tssId)); + + ASSERT(!TSS_doCompare(gvReq, gvReplyMissing, gvReplyA, SevInfo, tssId)); + ASSERT(!TSS_doCompare(gvReq, gvReplyA, gvReplyB, SevInfo, tssId)); + + // test GetKeyValues + Arena a; // for all of the refs. ASAN complains if this isn't done. 
Could also make them all standalone i guess + GetKeyValuesRequest gkvReq; + gkvReq.begin = firstGreaterOrEqual(StringRef(a, "A")); + gkvReq.end = firstGreaterOrEqual(StringRef(a, "C")); + gkvReq.version = 5; + + GetKeyValuesReply gkvReplyEmpty; + GetKeyValuesReply gkvReplyOne; + KeyValueRef v; + v.key = StringRef(a, "a"); + v.value = StringRef(a, "1"); + gkvReplyOne.data.push_back_deep(gkvReplyOne.arena, v); + GetKeyValuesReply gkvReplyOneMore; + gkvReplyOneMore.data.push_back_deep(gkvReplyOneMore.arena, v); + gkvReplyOneMore.more = true; + + ASSERT(TSS_doCompare(gkvReq, gkvReplyEmpty, gkvReplyEmpty, SevInfo, tssId)); + ASSERT(TSS_doCompare(gkvReq, gkvReplyOne, gkvReplyOne, SevInfo, tssId)); + ASSERT(TSS_doCompare(gkvReq, gkvReplyOneMore, gkvReplyOneMore, SevInfo, tssId)); + ASSERT(!TSS_doCompare(gkvReq, gkvReplyEmpty, gkvReplyOne, SevInfo, tssId)); + ASSERT(!TSS_doCompare(gkvReq, gkvReplyOne, gkvReplyOneMore, SevInfo, tssId)); + + // test GetKey + GetKeyRequest gkReq; + gkReq.sel = KeySelectorRef(StringRef(a, "Z"), false, 1); + gkReq.version = 5; + + GetKeyReply gkReplyA(KeySelectorRef(StringRef(a, "A"), false, 20), false); + GetKeyReply gkReplyB(KeySelectorRef(StringRef(a, "B"), false, 10), false); + GetKeyReply gkReplyC(KeySelectorRef(StringRef(a, "C"), true, 0), false); + GetKeyReply gkReplyD(KeySelectorRef(StringRef(a, "D"), false, -10), false); + GetKeyReply gkReplyE(KeySelectorRef(StringRef(a, "E"), false, -20), false); + + // identical cases + ASSERT(TSS_doCompare(gkReq, gkReplyA, gkReplyA, SevInfo, tssId)); + ASSERT(TSS_doCompare(gkReq, gkReplyB, gkReplyB, SevInfo, tssId)); + ASSERT(TSS_doCompare(gkReq, gkReplyC, gkReplyC, SevInfo, tssId)); + ASSERT(TSS_doCompare(gkReq, gkReplyD, gkReplyD, SevInfo, tssId)); + ASSERT(TSS_doCompare(gkReq, gkReplyE, gkReplyE, SevInfo, tssId)); + + // relative offset cases + ASSERT(TSS_doCompare(gkReq, gkReplyA, gkReplyB, SevInfo, tssId)); + ASSERT(TSS_doCompare(gkReq, gkReplyB, gkReplyA, SevInfo, tssId)); + 
ASSERT(TSS_doCompare(gkReq, gkReplyA, gkReplyC, SevInfo, tssId)); + ASSERT(TSS_doCompare(gkReq, gkReplyC, gkReplyA, SevInfo, tssId)); + ASSERT(TSS_doCompare(gkReq, gkReplyB, gkReplyC, SevInfo, tssId)); + ASSERT(TSS_doCompare(gkReq, gkReplyC, gkReplyB, SevInfo, tssId)); + + ASSERT(TSS_doCompare(gkReq, gkReplyC, gkReplyD, SevInfo, tssId)); + ASSERT(TSS_doCompare(gkReq, gkReplyD, gkReplyC, SevInfo, tssId)); + ASSERT(TSS_doCompare(gkReq, gkReplyC, gkReplyE, SevInfo, tssId)); + ASSERT(TSS_doCompare(gkReq, gkReplyE, gkReplyC, SevInfo, tssId)); + ASSERT(TSS_doCompare(gkReq, gkReplyD, gkReplyE, SevInfo, tssId)); + ASSERT(TSS_doCompare(gkReq, gkReplyE, gkReplyD, SevInfo, tssId)); + + // test same offset/orEqual wrong key + ASSERT(!TSS_doCompare(gkReq, + GetKeyReply(KeySelectorRef(StringRef(a, "A"), true, 0), false), + GetKeyReply(KeySelectorRef(StringRef("B"), true, 0), false), + SevInfo, + tssId)); + // this could be from different shard boundaries, so don't say it's a mismatch + ASSERT(TSS_doCompare(gkReq, + GetKeyReply(KeySelectorRef(StringRef(a, "A"), false, 10), false), + GetKeyReply(KeySelectorRef(StringRef(a, "B"), false, 10), false), + SevInfo, + tssId)); + + // test offsets and key difference don't match + ASSERT(!TSS_doCompare(gkReq, + GetKeyReply(KeySelectorRef(StringRef(a, "A"), false, 0), false), + GetKeyReply(KeySelectorRef(StringRef("B"), false, 10), false), + SevInfo, + tssId)); + ASSERT(!TSS_doCompare(gkReq, + GetKeyReply(KeySelectorRef(StringRef(a, "A"), false, -10), false), + GetKeyReply(KeySelectorRef(StringRef("B"), false, 0), false), + SevInfo, + tssId)); + + // test key is next over in one shard, one found it and other didn't + // positive + // one that didn't find is +1 + ASSERT(TSS_doCompare(gkReq, + GetKeyReply(KeySelectorRef(StringRef(a, "A"), false, 1), false), + GetKeyReply(KeySelectorRef(StringRef("B"), true, 0), false), + SevInfo, + tssId)); + ASSERT(!TSS_doCompare(gkReq, + GetKeyReply(KeySelectorRef(StringRef(a, "A"), true, 0), false), + 
GetKeyReply(KeySelectorRef(StringRef("B"), false, 1), false), + SevInfo, + tssId)); + + // negative will have zero offset but not equal set + ASSERT(TSS_doCompare(gkReq, + GetKeyReply(KeySelectorRef(StringRef(a, "A"), true, 0), false), + GetKeyReply(KeySelectorRef(StringRef("B"), false, 0), false), + SevInfo, + tssId)); + ASSERT(!TSS_doCompare(gkReq, + GetKeyReply(KeySelectorRef(StringRef(a, "A"), false, 0), false), + GetKeyReply(KeySelectorRef(StringRef("B"), true, 0), false), + SevInfo, + tssId)); + + // test shard boundary key returned by incomplete query is the same as the key found by the other (only possible in + // positive direction) + ASSERT(TSS_doCompare(gkReq, + GetKeyReply(KeySelectorRef(StringRef(a, "A"), true, 0), false), + GetKeyReply(KeySelectorRef(StringRef("A"), false, 1), false), + SevInfo, + tssId)); + + // explictly test checksum function + std::string s = "A"; + std::string s12 = "ABCDEFGHIJKL"; + std::string s13 = "ABCDEFGHIJKLO"; + std::string checksumStart13 = "(13)"; + ASSERT(s == traceChecksumValue(StringRef(s))); + ASSERT(s12 == traceChecksumValue(StringRef(s12))); + ASSERT(checksumStart13 == traceChecksumValue(StringRef(s13)).substr(0, 4)); + return Void(); +} \ No newline at end of file diff --git a/fdbclient/StorageServerInterface.h b/fdbclient/StorageServerInterface.h index 84971f040b..9a514a447e 100644 --- a/fdbclient/StorageServerInterface.h +++ b/fdbclient/StorageServerInterface.h @@ -29,7 +29,9 @@ #include "fdbrpc/LoadBalance.actor.h" #include "fdbrpc/Stats.h" #include "fdbrpc/TimedRequest.h" +#include "fdbrpc/TSSComparison.h" #include "fdbclient/TagThrottle.h" +#include "flow/UnitTest.h" // Dead code, removed in the next protocol version struct VersionReply { @@ -54,6 +56,10 @@ struct StorageServerInterface { LocalityData locality; UID uniqueID; + // TODO get rid of explicit mapping? 
+ // Effectively implements Optional but serializer didn't like Optional + bool isTss; + UID tssPairID; RequestStream getValue; RequestStream getKey; @@ -74,8 +80,8 @@ struct StorageServerInterface { RequestStream getReadHotRanges; RequestStream getRangeSplitPoints; - explicit StorageServerInterface(UID uid) : uniqueID(uid) {} - StorageServerInterface() : uniqueID(deterministicRandom()->randomUniqueID()) {} + explicit StorageServerInterface(UID uid) : uniqueID(uid), isTss(false) {} + StorageServerInterface() : uniqueID(deterministicRandom()->randomUniqueID()), isTss(false) {} NetworkAddress address() const { return getValue.getEndpoint().getPrimaryAddress(); } NetworkAddress stableAddress() const { return getValue.getEndpoint().getStableAddress(); } Optional secondaryAddress() const { return getValue.getEndpoint().addresses.secondaryAddress; } @@ -88,7 +94,11 @@ struct StorageServerInterface { // considered if (ar.protocolVersion().hasSmallEndpoints()) { - serializer(ar, uniqueID, locality, getValue); + if (ar.protocolVersion().hasTSS()) { + serializer(ar, uniqueID, locality, getValue, isTss, tssPairID); + } else { + serializer(ar, uniqueID, locality, getValue); + } if (Ar::isDeserializing) { getKey = RequestStream(getValue.getEndpoint().getAdjustedEndpoint(1)); getKeyValues = RequestStream(getValue.getEndpoint().getAdjustedEndpoint(2)); @@ -127,8 +137,9 @@ struct StorageServerInterface { waitFailure, getQueuingMetrics, getKeyValueStoreType); - if (ar.protocolVersion().hasWatches()) + if (ar.protocolVersion().hasWatches()) { serializer(ar, watchValue); + } } } bool operator==(StorageServerInterface const& s) const { return uniqueID == s.uniqueID; } diff --git a/fdbclient/SystemData.cpp b/fdbclient/SystemData.cpp index 0f035b745c..1d7a750fe5 100644 --- a/fdbclient/SystemData.cpp +++ b/fdbclient/SystemData.cpp @@ -25,6 +25,7 @@ #include "flow/Arena.h" #include "flow/TDMetric.actor.h" #include "flow/serialize.h" +#include "flow/UnitTest.h" const KeyRef 
systemKeysPrefix = LiteralStringRef("\xff"); const KeyRangeRef normalKeys(KeyRef(), systemKeysPrefix); @@ -345,7 +346,11 @@ uint16_t cacheChangeKeyDecodeIndex(const KeyRef& key) { return idx; } +const KeyRef tssMappingChangeKey = LiteralStringRef("\xff\x02/tssMappingChangeKey"); +const KeyRangeRef tssMappingKeys(LiteralStringRef("\xff/tss/"), LiteralStringRef("\xff/tss0")); + const KeyRangeRef serverTagKeys(LiteralStringRef("\xff/serverTag/"), LiteralStringRef("\xff/serverTag0")); + const KeyRef serverTagPrefix = serverTagKeys.begin; const KeyRangeRef serverTagConflictKeys(LiteralStringRef("\xff/serverTagConflict/"), LiteralStringRef("\xff/serverTagConflict0")); @@ -532,6 +537,7 @@ const Key serverListKeyFor(UID serverID) { return wr.toValue(); } +// TODO use flatbuffers depending on version const Value serverListValue(StorageServerInterface const& server) { BinaryWriter wr(IncludeVersion(ProtocolVersion::withServerListValue())); wr << server; @@ -550,6 +556,18 @@ StorageServerInterface decodeServerListValue(ValueRef const& value) { return s; } +// TODO merge this with above stuff or something +const Value serverListValueFB(StorageServerInterface const& server) { + return ObjectWriter::toValue(server, IncludeVersion()); +} + +StorageServerInterface decodeServerListValueFB(ValueRef const& value) { + StorageServerInterface s; + ObjectReader reader(value.begin(), IncludeVersion()); + reader.deserialize(s); + return s; +} + // processClassKeys.contains(k) iff k.startsWith( processClassKeys.begin ) because '/'+1 == '0' const KeyRangeRef processClassKeys(LiteralStringRef("\xff/processClass/"), LiteralStringRef("\xff/processClass0")); const KeyRef processClassPrefix = processClassKeys.begin; @@ -636,15 +654,17 @@ std::string encodeFailedServersKey(AddressExclusion const& addr) { // const KeyRangeRef globalConfigKeys( LiteralStringRef("\xff/globalConfig/"), LiteralStringRef("\xff/globalConfig0") ); // const KeyRef globalConfigPrefix = globalConfigKeys.begin; -const 
KeyRangeRef globalConfigDataKeys( LiteralStringRef("\xff/globalConfig/k/"), LiteralStringRef("\xff/globalConfig/k0") ); +const KeyRangeRef globalConfigDataKeys(LiteralStringRef("\xff/globalConfig/k/"), + LiteralStringRef("\xff/globalConfig/k0")); const KeyRef globalConfigKeysPrefix = globalConfigDataKeys.begin; -const KeyRangeRef globalConfigHistoryKeys( LiteralStringRef("\xff/globalConfig/h/"), LiteralStringRef("\xff/globalConfig/h0") ); +const KeyRangeRef globalConfigHistoryKeys(LiteralStringRef("\xff/globalConfig/h/"), + LiteralStringRef("\xff/globalConfig/h0")); const KeyRef globalConfigHistoryPrefix = globalConfigHistoryKeys.begin; const KeyRef globalConfigVersionKey = LiteralStringRef("\xff/globalConfig/v"); -const KeyRangeRef workerListKeys( LiteralStringRef("\xff/worker/"), LiteralStringRef("\xff/worker0") ); +const KeyRangeRef workerListKeys(LiteralStringRef("\xff/worker/"), LiteralStringRef("\xff/worker0")); const KeyRef workerListPrefix = workerListKeys.begin; const Key workerListKeyFor(StringRef processID) { @@ -1085,3 +1105,62 @@ const KeyRangeRef testOnlyTxnStateStorePrefixRange(LiteralStringRef("\xff/TESTON const KeyRef writeRecoveryKey = LiteralStringRef("\xff/writeRecovery"); const ValueRef writeRecoveryKeyTrue = LiteralStringRef("1"); const KeyRef snapshotEndVersionKey = LiteralStringRef("\xff/snapshotEndVersion"); + +// for tests +void testSSISerdes(StorageServerInterface const& ssi, bool useFB) { + printf("ssi=\nid=%s\nlocality=%s\nisTss=%s\ntssId=%s\naddress=%s\ngetValue=%s\n\n\n", + ssi.id().toString().c_str(), + ssi.locality.toString().c_str(), + ssi.isTss ? "true" : "false", + ssi.isTss ? ssi.tssPairID.toString().c_str() : "", + ssi.address().toString().c_str(), + ssi.getValue.getEndpoint().token.toString().c_str()); + + StorageServerInterface ssi2 = + (useFB) ? 
decodeServerListValueFB(serverListValueFB(ssi)) : decodeServerListValue(serverListValue(ssi)); + + printf("ssi2=\nid=%s\nlocality=%s\nisTss=%s\ntssId=%s\naddress=%s\ngetValue=%s\n\n\n", + ssi2.id().toString().c_str(), + ssi2.locality.toString().c_str(), + ssi2.isTss ? "true" : "false", + ssi2.isTss ? ssi2.tssPairID.toString().c_str() : "", + ssi2.address().toString().c_str(), + ssi2.getValue.getEndpoint().token.toString().c_str()); + + ASSERT(ssi.id() == ssi2.id()); + ASSERT(ssi.locality == ssi2.locality); + ASSERT(ssi.isTss == ssi2.isTss); + if (ssi.isTss) { + ASSERT(ssi2.tssPairID == ssi2.tssPairID); + } + ASSERT(ssi.address() == ssi2.address()); + ASSERT(ssi.getValue.getEndpoint().token == ssi2.getValue.getEndpoint().token); +} + +// unit test for serialization since tss stuff had bugs +TEST_CASE("/SystemData/SerDes/SSI") { + printf("testing ssi serdes\n"); + LocalityData localityData(Optional>(), + Standalone(deterministicRandom()->randomUniqueID().toString()), + Standalone(deterministicRandom()->randomUniqueID().toString()), + Optional>()); + + // non-tss + StorageServerInterface ssi; + ssi.uniqueID = UID(0x1234123412341234, 0x5678567856785678); + ssi.locality = localityData; + ssi.isTss = false; + ssi.initEndpoints(); + + testSSISerdes(ssi, false); + testSSISerdes(ssi, true); + + ssi.isTss = true; + ssi.tssPairID = UID(0x2345234523452345, 0x1238123812381238); + + testSSISerdes(ssi, false); + testSSISerdes(ssi, true); + printf("ssi serdes test complete\n"); + + return Void(); +} \ No newline at end of file diff --git a/fdbclient/SystemData.h b/fdbclient/SystemData.h index 79efb688c8..b9efe1e8a5 100644 --- a/fdbclient/SystemData.h +++ b/fdbclient/SystemData.h @@ -115,6 +115,11 @@ extern const KeyRef cacheChangePrefix; const Key cacheChangeKeyFor(uint16_t idx); uint16_t cacheChangeKeyDecodeIndex(const KeyRef& key); +// "\xff/tss/[[serverId]]" := "[[tssId]]" +extern const KeyRef tssMappingChangeKey; +extern const KeyRangeRef tssMappingKeys; +extern const KeyRef 
tssMappingPrefix; + // "\xff/serverTag/[[serverID]]" = "[[Tag]]" // Provides the Tag for the given serverID. Used to access a // storage server's corresponding TLog in order to apply mutations. diff --git a/fdbrpc/CMakeLists.txt b/fdbrpc/CMakeLists.txt index 055e497034..89ec859e8f 100644 --- a/fdbrpc/CMakeLists.txt +++ b/fdbrpc/CMakeLists.txt @@ -29,7 +29,8 @@ set(FDBRPC_SRCS sim2.actor.cpp sim_validation.cpp TimedRequest.h - TraceFileIO.cpp) + TraceFileIO.cpp + TSSComparison.h) set(COMPILE_EIO OFF) diff --git a/fdbrpc/LoadBalance.actor.h b/fdbrpc/LoadBalance.actor.h index 393c3c0ee2..2f1ee375bf 100644 --- a/fdbrpc/LoadBalance.actor.h +++ b/fdbrpc/LoadBalance.actor.h @@ -31,11 +31,16 @@ #include "flow/flow.h" #include "flow/Knobs.h" +// TODO REMOVE? +#include + #include "fdbrpc/FailureMonitor.h" #include "fdbrpc/fdbrpc.h" #include "fdbrpc/Locality.h" #include "fdbrpc/QueueModel.h" #include "fdbrpc/MultiInterface.h" +#include "fdbrpc/simulator.h" // for checking tss simulation mode +#include "fdbrpc/TSSComparison.h" #include "flow/actorcompiler.h" // This must be the last #include. 
using std::vector; @@ -75,6 +80,82 @@ struct LoadBalancedReply { Optional getLoadBalancedReply(const LoadBalancedReply* reply); Optional getLoadBalancedReply(const void*); +ACTOR template +Future tssComparison(Req req, + Future> fSource, + Future> fTss, + TSSEndpointData tssData) { + // TODO add timeout and time requests + state double startTime = now(); + state Future>> fTssWithTimeout = timeout(fTss, 5.0 /*TODO knob?*/); + state int finished = 0; + state double srcEndTime; + state double tssEndTime; + + loop { + choose { + when(state ErrorOr src = wait(fSource)) { + srcEndTime = now(); + fSource = Never(); + finished++; + if (finished == 2) { + break; + } + } + when(state Optional> tss = wait(fTssWithTimeout)) { + tssEndTime = now(); + fTssWithTimeout = Never(); + finished++; + if (finished == 2) { + break; + } + } + } + } + + ++tssData.metrics->requests; + + if (src.isError()) { + ++tssData.metrics->ssErrors; + } + if (!tss.present()) { + ++tssData.metrics->tssTimeouts; + } else if (tss.get().isError()) { + ++tssData.metrics->tssErrors; + printf("Tss got error %d\n", tss.get().getError().code()); + } + if (!src.isError() && tss.present() && !tss.get().isError()) { + Optional srcLB = getLoadBalancedReply(&src.get()); + Optional tssLB = getLoadBalancedReply(&tss.get().get()); + ASSERT(srcLB.present() == + tssLB.present()); // getLoadBalancedReply returned different responses for same templated type + + // if Resp is a LoadBalancedReply, only compare if both replies are non-error + if (!srcLB.present() || (!srcLB.get().error.present() && !tssLB.get().error.present())) { + // only record latency difference if both requests actually succeeded, so that we're comparing apples to + // apples + tssData.metrics->recordLatency(req, srcEndTime - startTime, tssEndTime - startTime); + + // expect mismatches in drop mutations mode. + Severity traceSeverity = + (g_network->isSimulated() && g_simulator.tssMode == ISimulator::TSSMode::EnabledDropMutations) + ? 
SevWarnAlways + : SevError; + + if (!TSS_doCompare(req, src.get(), tss.get().get(), traceSeverity, tssData.tssId)) { + ++tssData.metrics->mismatches; + } + } else if (tssLB.present() && tssLB.get().error.present()) { + ++tssData.metrics->tssErrors; + printf("Tss got LB error %d\n", tssLB.get().error.get().code()); + } else if (srcLB.present() && srcLB.get().error.present()) { + ++tssData.metrics->ssErrors; + } + } + + return Void(); +} + // Stores state for a request made by the load balancer template struct RequestData : NonCopyable { @@ -91,6 +172,26 @@ struct RequestData : NonCopyable { // This is true once setupRequest is called, even though at that point the response is Never(). bool isValid() { return response.isValid(); } + void maybeDuplicateTSSRequest(RequestStream const* stream, + Request const& request, + QueueModel* model, + Future ssResponse) { + if (model) { + // Send parallel request to TSS pair, if it exists + Optional tssData = model->getTssData(stream->getEndpoint().token.first()); + + if (tssData.present() && TSS_shouldDuplicateRequest(request)) { + resetReply(request); + + // TODO add timeout from knob to tss request? 
+ // FIXME: optimize to avoid creating new netNotifiedQueue for each message + RequestStream tssRequestStream(tssData.get().endpoint); + Future> fTssResult = tssRequestStream.tryGetReply(request); + model->addActor.send(tssComparison(request, fResult, fTssResult, tssData.get())); + } + } + } + // Initializes the request state and starts it, possibly after a backoff delay void startRequest(double backoff, bool triedAllOptions, @@ -105,12 +206,15 @@ struct RequestData : NonCopyable { delay(backoff), [this, stream, &request, model](Void _) { requestStarted = true; modelHolder = Reference(new ModelHolder(model, stream->getEndpoint().token.first())); - return stream->tryGetReply(request); + Future resp = stream->tryGetReply(request); + maybeDuplicateTSSRequest(stream, request, model, resp); + return resp; }); } else { requestStarted = true; modelHolder = Reference(new ModelHolder(model, stream->getEndpoint().token.first())); response = stream->tryGetReply(request); + maybeDuplicateTSSRequest(stream, request, model, response); } requestProcessed = false; diff --git a/fdbrpc/QueueModel.cpp b/fdbrpc/QueueModel.cpp index fa458d6738..2cb5687b61 100644 --- a/fdbrpc/QueueModel.cpp +++ b/fdbrpc/QueueModel.cpp @@ -18,6 +18,8 @@ * limitations under the License. 
*/ +#include + #include "fdbrpc/QueueModel.h" #include "fdbrpc/LoadBalance.h" @@ -60,6 +62,39 @@ double QueueModel::addRequest(uint64_t id) { return d.penalty; } +void QueueModel::updateTssEndpoint(uint64_t endpointId, TSSEndpointData tssData) { + auto& d = data[endpointId]; + if (!d.tssData.present()) { + tssCount++; + } + + d.tssData = Optional(tssData); + // TODO REMOVE print + printf("Setting tss endpoint for %" PRIx64 " = %s\n", endpointId, tssData.endpoint.token.toString().c_str()); +} + +void QueueModel::removeOldTssData(UID currentGeneration) { + if (tssCount > 0) { + // expire old tss mappings that aren't present in new mapping + for (auto& it : data) { + if (it.second.tssData.present() && it.second.tssData.get().generation != currentGeneration) { + // TODO REMOVE print + printf("Removing tss endpoint for %" PRIx64 + " because its generation %s doesn't match the current one %s\n", + it.first, + it.second.tssData.get().generation.toString().c_str(), + currentGeneration.toString().c_str()); + it.second.tssData = Optional(); + tssCount--; + } + } + } +} + +Optional QueueModel::getTssData(uint64_t id) { + return data[id].tssData; +} + Optional getLoadBalancedReply(const LoadBalancedReply* reply) { return *reply; } diff --git a/fdbrpc/QueueModel.h b/fdbrpc/QueueModel.h index 3ff07a80e9..f8592fa9a5 100644 --- a/fdbrpc/QueueModel.h +++ b/fdbrpc/QueueModel.h @@ -26,6 +26,19 @@ #include "fdbrpc/Smoother.h" #include "flow/Knobs.h" #include "flow/ActorCollection.h" +#include "fdbrpc/TSSComparison.h" // For TSS Metrics +#include "fdbrpc/FlowTransport.h" // For Endpoint + +struct TSSEndpointData { + UID tssId; + Endpoint endpoint; + Reference metrics; + UID generation; // TODO this isn't exactly like a generation since it's not ordered, i'll try to think of a better + // name + + TSSEndpointData(UID tssId, Endpoint endpoint, Reference metrics, UID generation) + : tssId(tssId), endpoint(endpoint), metrics(metrics), generation(generation) {} +}; // The data structure 
used for the client-side load balancing algorithm to // decide which storage server to read data from. Conceptually, it tracks the @@ -59,6 +72,10 @@ struct QueueData { // hasn't returned a valid result, increase above `futureVersionBackoff` // to increase the future backoff amount. double increaseBackoffTime; + + // a bit of a hack to store this here, but it's the only centralized place for per-endpoint tracking + Optional tssData; + QueueData() : latency(0.001), penalty(1.0), smoothOutstanding(FLOW_KNOBS->QUEUE_MODEL_SMOOTHING_AMOUNT), failedUntil(0), futureVersionBackoff(FLOW_KNOBS->FUTURE_VERSION_INITIAL_BACKOFF), increaseBackoffTime(0) {} @@ -91,7 +108,11 @@ public: Future laggingRequests; // requests for which a different recipient already answered int laggingRequestCount; - QueueModel() : secondMultiplier(1.0), secondBudget(0), laggingRequestCount(0) { + void updateTssEndpoint(uint64_t endpointId, TSSEndpointData endpointData); + void removeOldTssData(UID currentGeneration); + Optional getTssData(uint64_t endpointId); + + QueueModel() : secondMultiplier(1.0), secondBudget(0), laggingRequestCount(0), tssCount(0) { laggingRequests = actorCollection(addActor.getFuture(), &laggingRequestCount); } @@ -99,6 +120,7 @@ public: private: std::unordered_map data; + uint32_t tssCount; }; /* old queue model diff --git a/fdbrpc/TSSComparison.h b/fdbrpc/TSSComparison.h new file mode 100644 index 0000000000..6724e3dae7 --- /dev/null +++ b/fdbrpc/TSSComparison.h @@ -0,0 +1,78 @@ +/* + * TSSComparison.h + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * This header is to declare the tss comparison function that LoadBalance.Actor.h needs to be aware of to call, + * But StorageServerInterface.h needs to implement on the types defined in SSI.h. + */ +#ifndef FDBRPC_TSS_COMPARISON_H +#define FDBRPC_TSS_COMPARISON_H + +#include "fdbrpc/ContinuousSample.h" +#include "fdbrpc/Stats.h" + +// refcounted + noncopyable because both DatabaseContext and individual endpoints share ownership +struct TSSMetrics : ReferenceCounted, NonCopyable { + CounterCollection cc; + Counter requests; + Counter ssErrors; + Counter tssErrors; + Counter tssTimeouts; + Counter mismatches; + + // TODO we could probably just ignore getKey as it's seldom used? 
+ ContinuousSample SSgetValueLatency; + ContinuousSample SSgetKeyLatency; + ContinuousSample SSgetKeyValuesLatency; + + ContinuousSample TSSgetValueLatency; + ContinuousSample TSSgetKeyLatency; + ContinuousSample TSSgetKeyValuesLatency; + + template + void recordLatency(const Req& req, double ssLatency, double tssLatency); + + void clear() { + SSgetValueLatency.clear(); + SSgetKeyLatency.clear(); + SSgetKeyValuesLatency.clear(); + + TSSgetValueLatency.clear(); + TSSgetKeyLatency.clear(); + TSSgetKeyValuesLatency.clear(); + } + + TSSMetrics() + : cc("TSSClientMetrics"), requests("Requests", cc), ssErrors("SSErrors", cc), tssErrors("TSSErrors", cc), + tssTimeouts("TSSTimeouts", cc), mismatches("Mismatches", cc), SSgetValueLatency(1000), SSgetKeyLatency(1000), + SSgetKeyValuesLatency(1000), TSSgetValueLatency(1000), TSSgetKeyLatency(1000), TSSgetKeyValuesLatency(1000) {} +}; + +// global static functions + +template +bool TSS_shouldDuplicateRequest(const Req& req); + +// part of the contract of this function is that if there is a mismatch, the implementation needs to record a trace +// event with the specified severity and tssId in the event. 
+template +bool TSS_doCompare(const Req& req, const Rep& src, const Rep& tss, Severity traceSeverity, UID tssId); + +#endif diff --git a/fdbrpc/fdbrpc.h b/fdbrpc/fdbrpc.h index a2a6af5af6..e15e0126a1 100644 --- a/fdbrpc/fdbrpc.h +++ b/fdbrpc/fdbrpc.h @@ -335,6 +335,7 @@ public: Future disc = makeDependent(IFailureMonitor::failureMonitor()).onDisconnectOrFailure(getEndpoint(taskID)); if (disc.isReady()) { + printf("got disconnect or failure 1 :O\n"); return ErrorOr(request_maybe_delivered()); } Reference peer = @@ -353,6 +354,7 @@ public: Future disc = makeDependent(IFailureMonitor::failureMonitor()).onDisconnectOrFailure(getEndpoint()); if (disc.isReady()) { + printf("got disconnect or failure 2 :O\n"); return ErrorOr(request_maybe_delivered()); } Reference peer = diff --git a/fdbrpc/simulator.h b/fdbrpc/simulator.h index 4b74ed91ba..fd49c64447 100644 --- a/fdbrpc/simulator.h +++ b/fdbrpc/simulator.h @@ -41,7 +41,7 @@ public: : desiredCoordinators(1), physicalDatacenters(1), processesPerMachine(0), listenersPerProcess(1), isStopped(false), lastConnectionFailure(0), connectionFailuresDisableDuration(0), speedUpSimulation(false), allSwapsDisabled(false), backupAgents(BackupAgentType::WaitForType), drAgents(BackupAgentType::WaitForType), - extraDB(nullptr), allowLogSetKills(true), usableRegions(1) {} + extraDB(nullptr), allowLogSetKills(true), usableRegions(1), tssMode(TSSMode::Disabled) {} // Order matters! enum KillType { @@ -55,6 +55,9 @@ public: None }; + // Order matters! 
all modes >= 2 are fault injection modes + enum TSSMode { Disabled, EnabledNormal, EnabledAddDelay, EnabledDropMutations }; + enum class BackupAgentType { NoBackupAgents, WaitForType, BackupToFile, BackupToDB }; // Subclasses may subclass ProcessInfo as well @@ -401,6 +404,7 @@ public: int32_t satelliteTLogWriteAntiQuorumFallback; std::vector>> primarySatelliteDcIds; std::vector>> remoteSatelliteDcIds; + TSSMode tssMode; // Used by workloads that perform reconfigurations int testerCount; diff --git a/fdbserver/ApplyMetadataMutation.cpp b/fdbserver/ApplyMetadataMutation.cpp index 125344d721..87044f49b7 100644 --- a/fdbserver/ApplyMetadataMutation.cpp +++ b/fdbserver/ApplyMetadataMutation.cpp @@ -95,12 +95,14 @@ void applyMetadataMutations(SpanID const& spanContext, for (const auto& id : src) { auto storageInfo = getStorageInfo(id, storageCache, txnStateStore); + ASSERT(!storageInfo->interf.isTss); ASSERT(storageInfo->tag != invalidTag); info.tags.push_back(storageInfo->tag); info.src_info.push_back(storageInfo); } for (const auto& id : dest) { auto storageInfo = getStorageInfo(id, storageCache, txnStateStore); + ASSERT(!storageInfo->interf.isTss); ASSERT(storageInfo->tag != invalidTag); info.tags.push_back(storageInfo->tag); info.dest_info.push_back(storageInfo); @@ -113,6 +115,11 @@ void applyMetadataMutations(SpanID const& spanContext, txnStateStore->set(KeyValueRef(m.param1, m.param2)); } else if (m.param1.startsWith(serverKeysPrefix)) { if (toCommit) { + Optional t = + txnStateStore->readValue(serverTagKeyFor(serverKeysDecodeServer(m.param1))).get(); + // printf("got SetValue for serverKeysPrefix/%s, tag=%s\n", + // serverKeysDecodeServer(m.param1).toString().c_str(), t.present() ? 
+ // decodeServerTagValue(t.get()).toString().c_str() : ""); MutationRef privatized = m; privatized.param1 = m.param1.withPrefix(systemKeys.begin, arena); TraceEvent(SevDebug, "SendingPrivateMutation", dbgid) diff --git a/fdbserver/ClusterController.actor.cpp b/fdbserver/ClusterController.actor.cpp index 53304bc6f6..abb87fdf2d 100644 --- a/fdbserver/ClusterController.actor.cpp +++ b/fdbserver/ClusterController.actor.cpp @@ -1,3 +1,4 @@ + /* * ClusterController.actor.cpp * @@ -3185,9 +3186,9 @@ ACTOR Future workerAvailabilityWatch(WorkerInterface worker, checkOutstandingRequests(cluster); } } + when(wait(failed)) { // remove workers that have failed WorkerInfo& failedWorkerInfo = cluster->id_worker[worker.locality.processId()]; - if (!failedWorkerInfo.reply.isSet()) { failedWorkerInfo.reply.send( RegisterWorkerReply(failedWorkerInfo.details.processClass, failedWorkerInfo.priorityInfo)); @@ -3378,14 +3379,22 @@ void clusterRegisterMaster(ClusterControllerData* self, RegisterMasterRequest co isChanged = true; } + // TODO remove debugging + printf("CC:\ntss_count=%d\ntss_storage_engine=%d|%s\n", + db->config.desiredTSSCount, + db->config.testingStorageServerStoreType, + db->config.testingStorageServerStoreType.toString().c_str()); + // Construct the client information if (db->clientInfo->get().commitProxies != req.commitProxies || db->clientInfo->get().grvProxies != req.grvProxies) { isChanged = true; + // TODO why construct a new one and not just copy the old one and change proxies + id? 
ClientDBInfo clientInfo; clientInfo.id = deterministicRandom()->randomUniqueID(); clientInfo.commitProxies = req.commitProxies; clientInfo.grvProxies = req.grvProxies; + clientInfo.tssMapping = db->clientInfo->get().tssMapping; db->clientInfo->set(clientInfo); dbInfo.client = db->clientInfo->get(); } @@ -3861,6 +3870,136 @@ ACTOR Future monitorServerInfoConfig(ClusterControllerData::DBInfo* db) { } } +// Monitors the tss mapping change key for changes, +// and broadcasts the new tss mapping to the rest of the cluster in ClientDBInfo. +ACTOR Future monitorTSSMapping(ClusterControllerData* self) { + state KeyBackedMap tssMapDB = KeyBackedMap(tssMappingKeys.begin); + loop { + state Reference tr = + Reference(new ReadYourWritesTransaction(self->db.db)); + loop { + try { + tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); + tr->setOption(FDBTransactionOptions::READ_LOCK_AWARE); + + std::vector> tssResults = + wait(tssMapDB.getRange(tr, UID(), Optional(), CLIENT_KNOBS->TOO_MANY)); + ASSERT(tssResults.size() < CLIENT_KNOBS->TOO_MANY); + + state std::unordered_map tssIdMap; + std::set seenTssIds; + + for (auto& it : tssResults) { + tssIdMap[it.first] = it.second; + // ensure two storage servers don't map to same TSS + ASSERT(seenTssIds.insert(it.second).second); + } + + // TODO REMOVE print + printf("tss mapping of size %d\n", tssIdMap.size()); + + // TODO is copying storage server interfaces bad? 
+ state std::vector> newMapping; + state std::map oldMapping; + state bool mappingChanged = false; + + state ClientDBInfo clientInfo = self->db.clientInfo->get(); + + for (auto& it : clientInfo.tssMapping) { + oldMapping[it.first] = it.second; + if (!tssIdMap.count(it.first)) { + // TODO add trace event + printf("tss mapping removed: %s=%s\n", + it.first.toString().c_str(), + it.second.id().toString().c_str()); + TraceEvent("TSS_MappingRemoved", self->id) + .detail("SSID", it.first) + .detail("TSSID", it.second.id()); + mappingChanged = true; + } + } + + for (auto& it : tssIdMap) { + bool ssAlreadyPaired = oldMapping.count(it.first); + + state Optional oldTssId; + state Optional oldGetValueEndpoint; + + if (ssAlreadyPaired) { + auto interf = oldMapping[it.first]; + // check if this SS maps to a new TSS + oldTssId = Optional(interf.id()); + oldGetValueEndpoint = Optional(interf.getValue.getEndpoint().token); + if (interf.id() != it.second) { + TraceEvent("TSS_MappingChanged", self->id) + .detail("SSID", it.first) + .detail("TSSID", it.second) + .detail("OldTSSID", interf.id()); + printf("tss mapping updated: %s=%s\n", + it.first.toString().c_str(), + it.second.toString().c_str()); + mappingChanged = true; + } + } else { + // TODO add trace event + TraceEvent("TSS_MappingAdded", self->id).detail("SSID", it.first).detail("TSSID", it.second); + printf("tss mapping added: %s=%s\n", it.first.toString().c_str(), it.second.toString().c_str()); + mappingChanged = true; + } + + state UID ssid = it.first; + state UID tssid = it.second; + // request storage server interface for tssid, add it to results + // TODO could issue all of these futures and then process then after as an optimization + Optional tssiVal = wait(tr->get(serverListKeyFor(it.second))); + + // because we read the tss mapping in the same transaction, there can be no races with tss removal + // and the tss interface must exist + ASSERT(tssiVal.present()); + + StorageServerInterface tssi = 
decodeServerListValue(tssiVal.get()); + if (oldTssId.present() && tssi.id() == oldTssId.get() && oldGetValueEndpoint.present() && + oldGetValueEndpoint.get() != tssi.getValue.getEndpoint().token) { + // TODO REMOVE print + printf("tss %s restarted, getValue %s -> %s\n", + tssi.id().toString().c_str(), + oldGetValueEndpoint.get().toString().c_str(), + tssi.getValue.getEndpoint().token.toString().c_str()); + mappingChanged = true; + } + newMapping.push_back(std::pair(ssid, tssi)); + } + + // if nothing changed, skip updating + if (mappingChanged) { + // TODO REMOVE print + printf("CC updating tss client and server info\n"); + clientInfo.id = deterministicRandom()->randomUniqueID(); + clientInfo.tssMapping = newMapping; + self->db.clientInfo->set(clientInfo); + + ServerDBInfo serverInfo = self->db.serverInfo->get(); + // also change server db info so workers get new mapping + serverInfo.id = deterministicRandom()->randomUniqueID(); + serverInfo.infoGeneration = ++self->db.dbInfoCount; + serverInfo.client = clientInfo; + self->db.serverInfo->set(serverInfo); + } + + state Future tssChangeFuture = tr->watch(tssMappingChangeKey); + + wait(tr->commit()); + wait(tssChangeFuture); + + break; + } catch (Error& e) { + wait(tr->onError(e)); + } + } + } +} + // Monitors the global configuration version key for changes. When changes are // made, the global configuration history is read and any updates are sent to // all processes in the system by updating the ClientDBInfo object. 
The @@ -4411,6 +4550,7 @@ ACTOR Future clusterControllerCore(ClusterControllerFullInterface interf, self.addActor.send(handleForcedRecoveries(&self, interf)); self.addActor.send(monitorDataDistributor(&self)); self.addActor.send(monitorRatekeeper(&self)); + self.addActor.send(monitorTSSMapping(&self)); self.addActor.send(dbInfoUpdater(&self)); self.addActor.send(traceCounters("ClusterControllerMetrics", self.id, @@ -4452,6 +4592,7 @@ ACTOR Future clusterControllerCore(ClusterControllerFullInterface interf, when(GetWorkersRequest req = waitNext(interf.getWorkers.getFuture())) { ++self.getWorkersRequests; vector workers; + // printf("CC got GetWorkersRequest\n"); for (auto& it : self.id_worker) { if ((req.flags & GetWorkersRequest::NON_EXCLUDED_PROCESSES_ONLY) && diff --git a/fdbserver/CommitProxyServer.actor.cpp b/fdbserver/CommitProxyServer.actor.cpp index d1469c0d3b..3fc1ed02c3 100644 --- a/fdbserver/CommitProxyServer.actor.cpp +++ b/fdbserver/CommitProxyServer.actor.cpp @@ -1507,6 +1507,7 @@ ACTOR static Future rejoinServer(CommitProxyInterface proxy, ProxyCommitDa loop { GetStorageServerRejoinInfoRequest req = waitNext(proxy.getStorageServerRejoinInfo.getFuture()); + printf("Proxy got Rejoin req for %s\n", req.id.toString().c_str()); if (commitData->txnStateStore->readValue(serverListKeyFor(req.id)).get().present()) { GetStorageServerRejoinInfoReply rep; rep.version = commitData->version; @@ -1567,8 +1568,10 @@ ACTOR static Future rejoinServer(CommitProxyInterface proxy, ProxyCommitDa } rep.newTag = Tag(maxTagLocality + 1, 0); } + printf("Proxy sent Rejoin response for %s\n", req.id.toString().c_str()); req.reply.send(rep); } else { + printf("Proxy notifying %s it can't rejoin because it was removed.\n", req.id.toString().c_str()); req.reply.sendError(worker_removed()); } } diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index d1762fc7cb..cbb0364178 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ 
b/fdbserver/DataDistribution.actor.cpp @@ -66,6 +66,7 @@ struct TCServerInfo : public ReferenceCounted { Future> onInterfaceChanged; Promise removed; Future onRemoved; + Future onTSSPairRemoved; Promise wakeUpTracker; bool inDesiredDC; LocalityEntry localityEntry; @@ -83,8 +84,10 @@ struct TCServerInfo : public ReferenceCounted { Reference storageServerSet) : id(ssi.id()), collection(collection), lastKnownInterface(ssi), lastKnownClass(processClass), dataInFlightToServer(0), onInterfaceChanged(interfaceChanged.getFuture()), onRemoved(removed.getFuture()), - inDesiredDC(inDesiredDC), storeType(KeyValueStoreType::END) { - localityEntry = ((LocalityMap*)storageServerSet.getPtr())->add(ssi.locality, &id); + inDesiredDC(inDesiredDC), storeType(KeyValueStoreType::END), onTSSPairRemoved(Never()) { + if (!ssi.isTss) { + localityEntry = ((LocalityMap*)storageServerSet.getPtr())->add(ssi.locality, &id); + } } bool isCorrectStoreType(KeyValueStoreType configStoreType) { @@ -398,6 +401,7 @@ ACTOR Future> getInitialDataDistribution(Data state std::map> server_dc; state std::map, std::pair, vector>> team_cache; + state std::vector> tss_servers; // Get the server list in its own try/catch block since it modifies result. 
We don't want a subsequent failure // causing entries to be duplicated @@ -447,12 +451,19 @@ ACTOR Future> getInitialDataDistribution(Data for (int i = 0; i < serverList.get().size(); i++) { auto ssi = decodeServerListValue(serverList.get()[i].value); - result->allServers.push_back(std::make_pair(ssi, id_data[ssi.locality.processId()].processClass)); - server_dc[ssi.id()] = ssi.locality.dcId(); + if (!ssi.isTss) { + printf("DD adding SS %s on init\n", ssi.id().toString().c_str()); + result->allServers.push_back(std::make_pair(ssi, id_data[ssi.locality.processId()].processClass)); + server_dc[ssi.id()] = ssi.locality.dcId(); + } else { + printf("DD ignoring TSS %s on init until after team building\n", ssi.id().toString().c_str()); + tss_servers.push_back(std::make_pair(ssi, id_data[ssi.locality.processId()].processClass)); + } } break; } catch (Error& e) { + printf("get initial DD failed %d\n", e.code()); wait(tr.onError(e)); ASSERT(!succeeded); // We shouldn't be retrying if we have already started modifying result in this loop @@ -546,6 +557,7 @@ ACTOR Future> getInitialDataDistribution(Data beginKey = keyServers.end()[-1].key; break; } catch (Error& e) { + printf("GetInitialTeams got error %d\n", e.code()); TraceEvent("GetInitialTeamsKeyServersRetry", distributorId).error(e); wait(tr.onError(e)); @@ -559,6 +571,12 @@ ACTOR Future> getInitialDataDistribution(Data // a dummy shard at the end with no keys or servers makes life easier for trackInitialShards() result->shards.push_back(DDShardInfo(allKeys.end)); + // add tss to server list AFTER teams are built + for (auto& it : tss_servers) { + printf("DD adding TSS %s on init\n", it.first.id().toString().c_str()); + result->allServers.push_back(it); + } + return result; } @@ -567,7 +585,8 @@ ACTOR Future storageServerTracker(struct DDTeamCollection* self, TCServerInfo* server, Promise errorOut, Version addedVersion, - const DDEnabledState* ddEnabledState); + const DDEnabledState* ddEnabledState, + bool isTss); Future 
teamTracker(struct DDTeamCollection* const& self, Reference const& team, @@ -598,6 +617,8 @@ struct DDTeamCollection : ReferenceCounted { int64_t unhealthyServers; std::map priority_teams; std::map> server_info; + std::map> tss_info_by_pair; + std::map> server_and_tss_info; // TODO could replace this with an efficient way to do a read-only concatenation of 2 data structures? std::map lagging_zones; // zone to number of storage servers lagging AsyncVar disableFailingLaggingServers; @@ -610,6 +631,7 @@ struct DDTeamCollection : ReferenceCounted { vector> badTeams; Reference shardsAffectedByTeamFailure; PromiseStream removedServers; + PromiseStream removedTSS; std::set recruitingIds; // The IDs of the SS which are being recruited std::set recruitingLocalities; Future initialFailureReactionDelay; @@ -624,6 +646,8 @@ struct DDTeamCollection : ReferenceCounted { int optimalTeamCount; AsyncVar zeroOptimalTeams; + bool isTssRecruiting; // If tss recruiting is waiting on a pair, don't consider DD recruiting for the purposes of QuietDB + // EXCLUDED if an address is in the excluded list in the database. // FAILED if an address is permanently failed. // NONE by default. 
Updated asynchronously (eventually) @@ -709,7 +733,7 @@ struct DDTeamCollection : ReferenceCounted { initializationDoneActor(logOnCompletion(readyToStart && initialFailureReactionDelay, this)), optimalTeamCount(0), recruitingStream(0), restartRecruiting(SERVER_KNOBS->DEBOUNCE_RECRUITING_DELAY), unhealthyServers(0), includedDCs(includedDCs), otherTrackedDCs(otherTrackedDCs), - zeroHealthyTeams(zeroHealthyTeams), zeroOptimalTeams(true), primary(primary), + zeroHealthyTeams(zeroHealthyTeams), zeroOptimalTeams(true), primary(primary), isTssRecruiting(false), medianAvailableSpace(SERVER_KNOBS->MIN_AVAILABLE_SPACE_RATIO), lastMedianAvailableSpaceUpdate(0), processingUnhealthy(processingUnhealthy), lowestUtilizationTeam(0), highestUtilizationTeam(0), getShardMetrics(getShardMetrics), removeFailedServer(removeFailedServer) { @@ -758,10 +782,11 @@ struct DDTeamCollection : ReferenceCounted { // The following makes sure that, even if a reference to a team is held in the DD Queue, the tracker will be // stopped // before the server_status map to which it has a pointer, is destroyed. - for (auto& [_, info] : server_info) { + for (auto& [_, info] : server_and_tss_info) { info->tracker.cancel(); info->collection = nullptr; } + // TraceEvent("DDTeamCollectionDestructed", distributorId) // .detail("Primary", primary) // .detail("ServerTrackerDestroyed", server_info.size()); @@ -1128,6 +1153,7 @@ struct DDTeamCollection : ReferenceCounted { self->healthyZone.set(initTeams->initHealthyZoneValue); // SOMEDAY: If some servers have teams and not others (or some servers have more data than others) and there is // an address/locality collision, should we preferentially mark the least used server as undesirable? 
+ for (auto i = initTeams->allServers.begin(); i != initTeams->allServers.end(); ++i) { if (self->shouldHandleServer(i->first)) { if (!self->isValidLocality(self->configuration.storagePolicy, i->first.locality)) { @@ -1141,6 +1167,7 @@ struct DDTeamCollection : ReferenceCounted { self->addActor.send(self->checkInvalidLocalities); } } + printf("%p init adding %s\n", (void*)self, i->first.toString().c_str()); self->addServer(i->first, i->second, self->serverTrackerErrorOut, 0, ddEnabledState); } } @@ -2419,14 +2446,25 @@ struct DDTeamCollection : ReferenceCounted { if (!shouldHandleServer(newServer)) { return; } - allServers.push_back(newServer.id()); - TraceEvent("AddedStorageServer", distributorId) + // printf("addServer(%s)\n", newServer.id().toString().c_str()); + + if (!newServer.isTss) { + allServers.push_back(newServer.id()); + } + + TraceEvent(newServer.isTss ? "AddedTSS" : "AddedStorageServer", distributorId) .detail("ServerID", newServer.id()) .detail("ProcessClass", processClass.toString()) .detail("WaitFailureToken", newServer.waitFailure.getEndpoint().token) .detail("Address", newServer.waitFailure.getEndpoint().getPrimaryAddress()); - auto& r = server_info[newServer.id()] = makeReference( + + // TODO how to do this? 
+ /*if (newServer.isTss) { + tr.detail("TSSPairID", newServer.tssPairID); + }*/ + + auto& r = server_and_tss_info[newServer.id()] = makeReference( newServer, this, processClass, @@ -2434,12 +2472,33 @@ struct DDTeamCollection : ReferenceCounted { std::find(includedDCs.begin(), includedDCs.end(), newServer.locality.dcId()) != includedDCs.end(), storageServerSet); - // Establish the relation between server and machine - checkAndCreateMachine(r); + if (newServer.isTss) { + tss_info_by_pair[newServer.tssPairID] = r; - r->tracker = storageServerTracker(this, cx, r.getPtr(), errorOut, addedVersion, ddEnabledState); - doBuildTeams = true; // Adding a new server triggers to build new teams - restartTeamBuilder.trigger(); + if (server_info.count(newServer.tssPairID)) { + r->onTSSPairRemoved = server_info[newServer.tssPairID]->onRemoved; + } + } else { + server_info[newServer.id()] = r; + // Establish the relation between server and machine + checkAndCreateMachine(r); + } + + r->tracker = + storageServerTracker(this, cx, r.getPtr(), errorOut, addedVersion, ddEnabledState, newServer.isTss); + + if (!newServer.isTss) { + // link and wake up tss' tracker so it knows when this server gets removed + if (tss_info_by_pair.count(newServer.id())) { + tss_info_by_pair[newServer.id()]->onTSSPairRemoved = r->onRemoved; + if (tss_info_by_pair[newServer.id()]->wakeUpTracker.canBeSet()) { + tss_info_by_pair[newServer.id()]->wakeUpTracker.send(Void()); + } + } + + doBuildTeams = true; // Adding a new server triggers to build new teams + restartTeamBuilder.trigger(); + } } bool removeTeam(Reference team) { @@ -2605,7 +2664,21 @@ struct DDTeamCollection : ReferenceCounted { return foundMachineTeam; } + void removeTSS(UID removedServer) { + // much simpler than remove server. 
tss isn't in any teams, so just remove it from data structures + TEST(true); // Remove a TSS frm the cluster + printf("Removing tss %s\n", removedServer.toString().c_str()); + TraceEvent("RemovedTSS", distributorId).detail("ServerID", removedServer); + Reference removedServerInfo = server_and_tss_info[removedServer]; + + tss_info_by_pair.erase(removedServerInfo->lastKnownInterface.tssPairID); + server_and_tss_info.erase(removedServer); + + server_status.clear(removedServer); + } + void removeServer(UID removedServer) { + printf("Removing ss %s\n", removedServer.toString().c_str()); TraceEvent("RemovedStorageServer", distributorId).detail("ServerID", removedServer); // ASSERT( !shardsAffectedByTeamFailure->getServersForTeam( t ) for all t in teams that contain removedServer ) @@ -2703,6 +2776,7 @@ struct DDTeamCollection : ReferenceCounted { } } server_info.erase(removedServer); + server_and_tss_info.erase(removedServer); if (server_status.get(removedServer).initialized && server_status.get(removedServer).isUnhealthy()) { unhealthyServers--; @@ -2726,7 +2800,7 @@ struct DDTeamCollection : ReferenceCounted { }; TCServerInfo::~TCServerInfo() { - if (collection && ssVersionTooFarBehind.get()) { + if (collection && ssVersionTooFarBehind.get() && !lastKnownInterface.isTss) { collection->removeLaggingStorageServer(lastKnownInterface.locality.zoneId().get()); } } @@ -3359,6 +3433,7 @@ ACTOR Future teamTracker(DDTeamCollection* self, Reference tea .detail("IsReady", self->initialFailureReactionDelay.isReady()); self->traceTeamCollectionInfo(); } + // Check if the number of degraded machines has changed state vector> change; bool anyUndesired = false; @@ -3400,18 +3475,20 @@ ACTOR Future teamTracker(DDTeamCollection* self, Reference tea bool containsFailed = teamContainsFailedServer(self, team); bool recheck = !healthy && (lastReady != self->initialFailureReactionDelay.isReady() || (lastZeroHealthy && !self->zeroHealthyTeams->get()) || containsFailed); - // 
TraceEvent("TeamHealthChangeDetected", self->distributorId) - // .detail("Team", team->getDesc()) - // .detail("ServersLeft", serversLeft) - // .detail("LastServersLeft", lastServersLeft) - // .detail("AnyUndesired", anyUndesired) - // .detail("LastAnyUndesired", lastAnyUndesired) - // .detail("AnyWrongConfiguration", anyWrongConfiguration) - // .detail("LastWrongConfiguration", lastWrongConfiguration) - // .detail("Recheck", recheck) - // .detail("BadTeam", badTeam) - // .detail("LastZeroHealthy", lastZeroHealthy) - // .detail("ZeroHealthyTeam", self->zeroHealthyTeams->get()); + + // TODO recomment + TraceEvent("TeamHealthChangeDetected", self->distributorId) + .detail("Team", team->getDesc()) + .detail("ServersLeft", serversLeft) + .detail("LastServersLeft", lastServersLeft) + .detail("AnyUndesired", anyUndesired) + .detail("LastAnyUndesired", lastAnyUndesired) + .detail("AnyWrongConfiguration", anyWrongConfiguration) + .detail("LastWrongConfiguration", lastWrongConfiguration) + .detail("Recheck", recheck) + .detail("BadTeam", badTeam) + .detail("LastZeroHealthy", lastZeroHealthy) + .detail("ZeroHealthyTeam", self->zeroHealthyTeams->get()); lastReady = self->initialFailureReactionDelay.isReady(); lastZeroHealthy = self->zeroHealthyTeams->get(); @@ -3764,8 +3841,8 @@ ACTOR Future waitServerListChange(DDTeamCollection* self, ProcessClass const& processClass = results[i].second; if (!self->shouldHandleServer(ssi)) { continue; - } else if (self->server_info.count(serverId)) { - auto& serverInfo = self->server_info[serverId]; + } else if (self->server_and_tss_info.count(serverId)) { + auto& serverInfo = self->server_and_tss_info[serverId]; if (ssi.getValue.getEndpoint() != serverInfo->lastKnownInterface.getValue.getEndpoint() || processClass != serverInfo->lastKnownClass.classType()) { Promise> currentInterfaceChanged = @@ -3783,7 +3860,9 @@ ACTOR Future waitServerListChange(DDTeamCollection* self, self->serverTrackerErrorOut, tr.getReadVersion().get(), 
ddEnabledState); - self->doBuildTeams = true; + if (!ssi.isTss) { + self->doBuildTeams = true; + } } } @@ -3798,6 +3877,7 @@ ACTOR Future waitServerListChange(DDTeamCollection* self, } } } catch (Error& e) { + printf("WaitServerListChange got error %d\n", e.code()); wait(tr.onError(e)); serverListAndProcessClasses = Never(); isFetchingResults = false; @@ -3886,16 +3966,18 @@ ACTOR Future keyValueStoreTypeTracker(DDTeamCollection* self, TCServerInfo } ACTOR Future waitForAllDataRemoved(Database cx, UID serverID, Version addedVersion, DDTeamCollection* teams) { - state Transaction tr(cx); + state Reference tr = makeReference(cx); + printf("Waiting for data to be removed from %s\n", serverID.toString().c_str()); loop { try { - tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); - Version ver = wait(tr.getReadVersion()); + tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + Version ver = wait(tr->getReadVersion()); // we cannot remove a server immediately after adding it, because a perfectly timed master recovery could // cause us to not store the mutations sent to the short lived storage server. 
if (ver > addedVersion + SERVER_KNOBS->MAX_READ_TRANSACTION_LIFE_VERSIONS) { - bool canRemove = wait(canRemoveStorageServer(&tr, serverID)); + bool canRemove = wait(canRemoveStorageServer(tr, serverID)); // TraceEvent("WaitForAllDataRemoved") // .detail("Server", serverID) // .detail("CanRemove", canRemove) @@ -3908,9 +3990,9 @@ ACTOR Future waitForAllDataRemoved(Database cx, UID serverID, Version adde // Wait for any change to the serverKeys for this server wait(delay(SERVER_KNOBS->ALL_DATA_REMOVED_DELAY, TaskPriority::DataDistribution)); - tr.reset(); + tr->reset(); } catch (Error& e) { - wait(tr.onError(e)); + wait(tr->onError(e)); } } } @@ -3923,6 +4005,10 @@ ACTOR Future storageServerFailureTracker(DDTeamCollection* self, state StorageServerInterface interf = server->lastKnownInterface; state int targetTeamNumPerServer = (SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER * (self->configuration.storageTeamSize + 1)) / 2; + + printf("Starting failure tracker for %sSS %s\n", + server->lastKnownInterface.isTss ? 
"T" : "", + server->lastKnownInterface.id().toString().c_str()); loop { state bool inHealthyZone = false; // healthChanged actor will be Never() if this flag is true if (self->healthyZone.get().present()) { @@ -3941,16 +4027,18 @@ ACTOR Future storageServerFailureTracker(DDTeamCollection* self, } } - if (self->server_status.get(interf.id()).initialized) { - bool unhealthy = self->server_status.get(interf.id()).isUnhealthy(); - if (unhealthy && !status->isUnhealthy()) { - self->unhealthyServers--; - } - if (!unhealthy && status->isUnhealthy()) { + if (!interf.isTss) { + if (self->server_status.get(interf.id()).initialized) { + bool unhealthy = self->server_status.get(interf.id()).isUnhealthy(); + if (unhealthy && !status->isUnhealthy()) { + self->unhealthyServers--; + } + if (!unhealthy && status->isUnhealthy()) { + self->unhealthyServers++; + } + } else if (status->isUnhealthy()) { self->unhealthyServers++; } - } else if (status->isUnhealthy()) { - self->unhealthyServers++; } self->server_status.set(interf.id(), *status); @@ -3971,7 +4059,7 @@ ACTOR Future storageServerFailureTracker(DDTeamCollection* self, choose { when(wait(healthChanged)) { status->isFailed = !status->isFailed; - if (!status->isFailed && + if (!status->isFailed && !server->lastKnownInterface.isTss && (server->teams.size() < targetTeamNumPerServer || self->lastBuildTeamsFailed)) { self->doBuildTeams = true; } @@ -4014,7 +4102,9 @@ ACTOR Future storageServerTracker( TCServerInfo* server, // This actor is owned by this TCServerInfo, point to server_info[id] Promise errorOut, Version addedVersion, - const DDEnabledState* ddEnabledState) { + const DDEnabledState* ddEnabledState, + bool isTss) { + state Future failureTracker; state ServerStatus status(false, false, server->lastKnownInterface.locality); state bool lastIsUnhealthy = false; @@ -4022,13 +4112,16 @@ ACTOR Future storageServerTracker( state Future> interfaceChanged = server->onInterfaceChanged; - state Future storeTypeTracker = 
keyValueStoreTypeTracker(self, server); + state Future storeTypeTracker = (isTss) ? Never() : keyValueStoreTypeTracker(self, server); state bool hasWrongDC = !isCorrectDC(self, server); state bool hasInvalidLocality = !self->isValidLocality(self->configuration.storagePolicy, server->lastKnownInterface.locality); state int targetTeamNumPerServer = (SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER * (self->configuration.storageTeamSize + 1)) / 2; + // TODO REMOVE + printf("Started %sSS tracker for %s\n", isTss ? "T" : "", server->id.toString().c_str()); + try { loop { status.isUndesired = !self->disableFailingLaggingServers.get() && server->ssVersionTooFarBehind.get(); @@ -4042,7 +4135,7 @@ ACTOR Future storageServerTracker( // dcLocation, interface) is changed. state std::vector> otherChanges; std::vector> wakeUpTrackers; - for (const auto& i : self->server_info) { + for (const auto& i : self->server_and_tss_info) { if (i.second.getPtr() != server && i.second->lastKnownInterface.address() == server->lastKnownInterface.address()) { auto& statusInfo = self->server_status.get(i.first); @@ -4144,11 +4237,11 @@ ACTOR Future storageServerTracker( .detail("Excluded", worstAddr.toString()); status.isUndesired = true; status.isWrongConfiguration = true; - if (worstStatus == DDTeamCollection::Status::FAILED) { + if (worstStatus == DDTeamCollection::Status::FAILED && !isTss) { TraceEvent(SevWarn, "FailedServerRemoveKeys", self->distributorId) .detail("Server", server->id) .detail("Excluded", worstAddr.toString()); - wait(delay(0.0)); //Do not throw an error while still inside trackExcludedServers + wait(delay(0.0)); // Do not throw an error while still inside trackExcludedServers while (!ddEnabledState->isDDEnabled()) { wait(delay(1.0)); } @@ -4165,7 +4258,7 @@ ACTOR Future storageServerTracker( self->restartRecruiting.trigger(); } - if (lastIsUnhealthy && !status.isUnhealthy() && + if (lastIsUnhealthy && !status.isUnhealthy() && !isTss && (server->teams.size() < targetTeamNumPerServer 
|| self->lastBuildTeamsFailed)) { self->doBuildTeams = true; self->restartTeamBuilder.trigger(); // This does not trigger building teams if there exist healthy teams @@ -4174,7 +4267,9 @@ ACTOR Future storageServerTracker( state bool recordTeamCollectionInfo = false; choose { - when(wait(failureTracker)) { + when(wait(failureTracker || server->onTSSPairRemoved)) { + printf("Server %s getting removed\n", server->id.toString().c_str()); + // The server is failed AND all data has been removed from it, so permanently remove it. TraceEvent("StatusMapChange", self->distributorId) .detail("ServerID", server->id) @@ -4185,7 +4280,9 @@ ACTOR Future storageServerTracker( } // Remove server from FF/serverList - wait(removeStorageServer(cx, server->id, self->lock, ddEnabledState)); + Optional tssPairID = + server->lastKnownInterface.isTss ? server->lastKnownInterface.tssPairID : Optional(); + wait(removeStorageServer(cx, server->id, tssPairID, self->lock, ddEnabledState)); TraceEvent("StatusMapChange", self->distributorId) .detail("ServerID", server->id) @@ -4193,7 +4290,11 @@ ACTOR Future storageServerTracker( // Sets removeSignal (alerting dataDistributionTeamCollection to remove the storage server from its // own data structures) server->removed.send(Void()); - self->removedServers.send(server->id); + if (isTss) { + self->removedTSS.send(server->id); + } else { + self->removedServers.send(server->id); + } return Void(); } when(std::pair newInterface = wait(interfaceChanged)) { @@ -4211,7 +4312,7 @@ ACTOR Future storageServerTracker( server->lastKnownInterface = newInterface.first; server->lastKnownClass = newInterface.second; - if (localityChanged) { + if (localityChanged && !isTss) { TEST(true); // Server locality changed // The locality change of a server will affect machine teams related to the server if @@ -4303,7 +4404,7 @@ ACTOR Future storageServerTracker( recordTeamCollectionInfo = true; // Restart the storeTracker for the new interface. 
This will cancel the previous // keyValueStoreTypeTracker - storeTypeTracker = keyValueStoreTypeTracker(self, server); + storeTypeTracker = (isTss) ? Never() : keyValueStoreTypeTracker(self, server); hasWrongDC = !isCorrectDC(self, server); hasInvalidLocality = !self->isValidLocality(self->configuration.storagePolicy, server->lastKnownInterface.locality); @@ -4350,6 +4451,7 @@ ACTOR Future storageServerTracker( // Monitor whether or not storage servers are being recruited. If so, then a database cannot be considered quiet ACTOR Future monitorStorageServerRecruitment(DDTeamCollection* self) { state bool recruiting = false; + state bool lastIsTss = false; TraceEvent("StorageServerRecruitment", self->distributorId) .detail("State", "Idle") .trackLatest("StorageServerRecruitment_" + self->distributorId.toString()); @@ -4360,12 +4462,22 @@ ACTOR Future monitorStorageServerRecruitment(DDTeamCollection* self) { } TraceEvent("StorageServerRecruitment", self->distributorId) .detail("State", "Recruiting") + .detail("IsTSS", self->isTssRecruiting ? "True" : "False") .trackLatest("StorageServerRecruitment_" + self->distributorId.toString()); recruiting = true; + lastIsTss = self->isTssRecruiting; } else { loop { choose { - when(wait(self->recruitingStream.onChange())) {} + when(wait(self->recruitingStream.onChange())) { + if (lastIsTss != self->isTssRecruiting) { + TraceEvent("StorageServerRecruitment", self->distributorId) + .detail("State", "Recruiting") + .detail("IsTSS", self->isTssRecruiting ? "True" : "False") + .trackLatest("StorageServerRecruitment_" + self->distributorId.toString()); + lastIsTss = self->isTssRecruiting; + } + } when(wait(self->recruitingStream.get() == 0 ? delay(SERVER_KNOBS->RECRUITMENT_IDLE_DELAY, TaskPriority::DataDistribution) : Future(Never()))) { @@ -4444,8 +4556,9 @@ ACTOR Future checkAndRemoveInvalidLocalityAddr(DDTeamCollection* self) { } int numExistingSSOnAddr(DDTeamCollection* self, const AddressExclusion& addr) { + // TODO add tss? 
int numExistingSS = 0; - for (auto& server : self->server_info) { + for (auto& server : self->server_and_tss_info) { const NetworkAddress& netAddr = server.second->lastKnownInterface.stableAddress(); AddressExclusion usedAddr(netAddr.ip, netAddr.port); if (usedAddr == addr) { @@ -4456,9 +4569,75 @@ int numExistingSSOnAddr(DDTeamCollection* self, const AddressExclusion& addr) { return numExistingSS; } +// All state that represents an ongoing tss pair recruitment +struct TSSRecruitmentState : ReferenceCounted, NonCopyable { + Promise>> + ssPairInfo; // if set, for ss to pass its id to tss pair once it is successfully recruited + Promise tssPairDone; // if set, for tss to pass ss that it was successfully recruited + Optional dcId; // dc + bool active; + + TSSRecruitmentState() : active(false) {} + + TSSRecruitmentState(Optional dcId) : active(true), dcId(dcId) {} + + void cancel() { + // only cancel if both haven't been set, otherwise one half of pair could think it was successful but the other + // half would think it failed + if (active && ssPairInfo.canBeSet() && tssPairDone.canBeSet()) { + ssPairInfo.send(Optional>()); + // callback of ssPairInfo could have cancelled tssPairDone already, so double check before cancelling + if (tssPairDone.canBeSet()) { + tssPairDone.send(false); + } + } + } + + bool tssRecruitSuccess() { + if (active && tssPairDone.canBeSet()) { + tssPairDone.send(true); + return true; + } + return false; + } + + bool tssRecruitFailed() { + if (active && tssPairDone.canBeSet()) { + printf("tssPair: %p\n", &tssPairDone); + tssPairDone.send(false); + return true; + } + return false; + } + + bool ssRecruitSuccess(std::pair ssInfo) { + if (active && ssPairInfo.canBeSet()) { + ssPairInfo.send(Optional>(ssInfo)); + return true; + } + return false; + } + + bool ssRecruitFailed() { + if (active && ssPairInfo.canBeSet()) { + ssPairInfo.send(Optional>()); + return true; + } + return false; + } + + Future>> waitOnSS() { return ssPairInfo.getFuture(); } + + 
Future waitOnTSS() { return tssPairDone.getFuture(); } +}; + +// TODO switch recruitment order(ish) - grab tss but don't init it, wait for it to actually grab an ss, then the ss +// signals here to start, then when done this signals the ss to add server ACTOR Future initializeStorage(DDTeamCollection* self, RecruitStorageReply candidateWorker, - const DDEnabledState* ddEnabledState) { + const DDEnabledState* ddEnabledState, + bool recruitTss, + Reference tssState) { // SOMEDAY: Cluster controller waits for availability, retry quickly if a server's Locality changes self->recruitingStream.set(self->recruitingStream.get() + 1); @@ -4470,11 +4649,61 @@ ACTOR Future initializeStorage(DDTeamCollection* self, // too many storage server on the same address (i.e., process) can cause OOM. // Ask the candidateWorker to initialize a SS only if the worker does not have a pending request state UID interfaceId = deterministicRandom()->randomUniqueID(); - InitializeStorageRequest isr; - isr.storeType = self->configuration.storageServerStoreType; + + state InitializeStorageRequest isr; + isr.storeType = + recruitTss ? self->configuration.testingStorageServerStoreType : self->configuration.storageServerStoreType; isr.seedTag = invalidTag; isr.reqId = deterministicRandom()->randomUniqueID(); isr.interfaceId = interfaceId; + isr.isTss = recruitTss; + + printf("InitStorage %s on %sSS %s\n", + interfaceId.toString().c_str(), + recruitTss ? "T" : "", + candidateWorker.worker.address().toString().c_str()); + + self->recruitingIds.insert(interfaceId); + self->recruitingLocalities.insert(candidateWorker.worker.stableAddress()); + + // if tss, wait for pair ss to finish and add its id to isr. 
If pair fails, don't recruit tss + state bool doRecruit = true; + if (recruitTss) { + TraceEvent("TSS_Recruit", self->distributorId) + .detail("TSSID", interfaceId) + .detail("Stage", "TSSWaitingPair") + .detail("Addr", candidateWorker.worker.address()) + .detail("Locality", candidateWorker.worker.locality.toString()); + + printf("TSS %s waiting for partner uid\n", interfaceId.toString().c_str()); + Optional> ssPairInfoResult = wait(tssState->waitOnSS()); + if (ssPairInfoResult.present()) { + printf("TSS %s got pair of %s @ %lld\n", + interfaceId.toString().c_str(), + ssPairInfoResult.get().first.toString().c_str(), + ssPairInfoResult.get().second); + isr.tssPairID = ssPairInfoResult.get().first; + isr.tssPairVersion = ssPairInfoResult.get().second; + + TraceEvent("TSS_Recruit", self->distributorId) + .detail("SSID", isr.tssPairID) + .detail("TSSID", interfaceId) + .detail("Stage", "TSSWaitingPair") + .detail("Addr", candidateWorker.worker.address()) + .detail("Locality", candidateWorker.worker.locality.toString()); + } else { + printf("TSS %s didn't get partner, partner recruitment must have failed, abandoning\n", + interfaceId.toString().c_str()); + isr.isTss = false; + doRecruit = false; + + TraceEvent(SevWarn, "TSS_RecruitError", self->distributorId) + .detail("TSSID", interfaceId) + .detail("Reason", "SS recruitment failed for some reason") + .detail("Addr", candidateWorker.worker.address()) + .detail("Locality", candidateWorker.worker.locality.toString()); + } + } TraceEvent("DDRecruiting") .detail("Primary", self->primary) @@ -4483,19 +4712,64 @@ ACTOR Future initializeStorage(DDTeamCollection* self, .detail("WorkerLocality", candidateWorker.worker.locality.toString()) .detail("Interf", interfaceId) .detail("Addr", candidateWorker.worker.address()) + .detail("TSS", recruitTss ? 
"true" : "false") .detail("RecruitingStream", self->recruitingStream.get()); - self->recruitingIds.insert(interfaceId); - self->recruitingLocalities.insert(candidateWorker.worker.stableAddress()); - state ErrorOr newServer = - wait(candidateWorker.worker.storage.tryGetReply(isr, TaskPriority::DataDistribution)); - if (newServer.isError()) { + Future> fRecruit = + doRecruit ? candidateWorker.worker.storage.tryGetReply(isr, TaskPriority::DataDistribution) + : Future>(ErrorOr(recruitment_failed())); + + state ErrorOr newServer = wait(fRecruit); + + if (doRecruit && newServer.isError()) { TraceEvent(SevWarn, "DDRecruitmentError").error(newServer.getError()); if (!newServer.isError(error_code_recruitment_failed) && !newServer.isError(error_code_request_maybe_delivered)) throw newServer.getError(); wait(delay(SERVER_KNOBS->STORAGE_RECRUITMENT_DELAY, TaskPriority::DataDistribution)); } + + if (!recruitTss && newServer.present() && + tssState->ssRecruitSuccess(std::pair(interfaceId, newServer.get().addedVersion))) { + printf("ss %s signalling tss pair with version %lld\n", + interfaceId.toString().c_str(), + newServer.get().addedVersion); + // ss has a tss pair. send it this id, but wait for add server until tss is recruited + + TraceEvent("TSS_Recruit", self->distributorId) + .detail("SSID", interfaceId) + .detail("Stage", "SSSignaling") + .detail("Addr", candidateWorker.worker.address()) + .detail("Locality", candidateWorker.worker.locality.toString()); + + // wait for timeout, and give up if no TSS pair recruited + Optional tssSuccessful = wait(timeout(tssState->waitOnTSS(), SERVER_KNOBS->TSS_RECRUITMENT_TIMEOUT)); + + // TODO if unsuccessful, fail out tss so it doesn't cause a mismatch error? 
+ if (tssSuccessful.present() && tssSuccessful.get()) { + TraceEvent("TSS_Recruit", self->distributorId) + .detail("SSID", interfaceId) + .detail("Stage", "SSGotPair") + .detail("Addr", candidateWorker.worker.address()) + .detail("Locality", candidateWorker.worker.locality.toString()); + } else { + TraceEvent(SevWarn, "TSS_RecruitError", self->distributorId) + .detail("SSID", interfaceId) + .detail("Reason", + tssSuccessful.present() ? "TSS recruitment failed for some reason" + : "TSS recruitment timed out") + .detail("Addr", candidateWorker.worker.address()) + .detail("Locality", candidateWorker.worker.locality.toString()); + + // TODO need to remove that tss here!! + } + + // TODO trace event, change sev and message if timeout or if unsuccessful + printf("ss %s %ssuccessfully got tss pair!\n", + interfaceId.toString().c_str(), + (tssSuccessful.present() && tssSuccessful.get()) ? "" : "un"); + } + self->recruitingIds.erase(interfaceId); self->recruitingLocalities.erase(candidateWorker.worker.stableAddress()); @@ -4509,26 +4783,46 @@ ACTOR Future initializeStorage(DDTeamCollection* self, .detail("RecruitingStream", self->recruitingStream.get()); if (newServer.present()) { - if (!self->server_info.count(newServer.get().interf.id())) - self->addServer(newServer.get().interf, - candidateWorker.processClass, - self->serverTrackerErrorOut, - newServer.get().addedVersion, - ddEnabledState); - else - TraceEvent(SevWarn, "DDRecruitmentError").detail("Reason", "Server ID already recruited"); - - self->doBuildTeams = true; + UID id = newServer.get().interf.id(); + if (!self->server_and_tss_info.count(id)) { + if (!recruitTss || tssState->tssRecruitSuccess()) { + self->addServer(newServer.get().interf, + candidateWorker.processClass, + self->serverTrackerErrorOut, + newServer.get().addedVersion, + ddEnabledState); + } else { + // TODO tss recruitment was cancelled since it failed to send a response to the ss, kill it + printf("TSS recruitment was cancelled, stop\n"); + } + } 
else { + TraceEvent(SevWarn, "DDRecruitmentError") + .detail("Reason", "Server ID already recruited") + .detail("ServerID", id); + } + if (!recruitTss) { + self->doBuildTeams = true; + } } } + if (recruitTss && tssState->tssRecruitFailed()) { + TEST(true); // TSS recruitment failed for some reason + // if tss wasn't already marked as done, it was unsuccessful in recruitment + printf("tss recruitment failed for some reason, signalling ss.\n"); + } + if (!recruitTss && tssState->ssRecruitFailed()) { + TEST(true); // SS with pair TSS recruitment failed for some reason + // if ss didn't already send its pair id to tss, it was unsuccessful in recruitment + printf("ss recruitment failed for some reason, signalling tss.\n"); + } + self->recruitingStream.set(self->recruitingStream.get() - 1); self->restartRecruiting.trigger(); return Void(); } -// Recruit a worker as a storage server ACTOR Future storageRecruiter(DDTeamCollection* self, Reference> db, const DDEnabledState* ddEnabledState) { @@ -4536,13 +4830,24 @@ ACTOR Future storageRecruiter(DDTeamCollection* self, state RecruitStorageRequest lastRequest; state bool hasHealthyTeam; state std::map numSSPerAddr; + + // tss-specific recruitment state + state uint32_t tssToRecruit = self->configuration.desiredTSSCount - db->get().client.tssMapping.size(); + state Reference tssState = makeReference(); + + printf("DD setting tssToRecruit=%d (%d - %d)\n", + tssToRecruit, + self->configuration.desiredTSSCount, + db->get().client.tssMapping.size()); + TraceEvent(SevDebug, "TSS_RecruitUpdated", self->distributorId).detail("Count", tssToRecruit); + loop { try { numSSPerAddr.clear(); hasHealthyTeam = (self->healthyTeamCount != 0); RecruitStorageRequest rsr; std::set exclusions; - for (auto s = self->server_info.begin(); s != self->server_info.end(); ++s) { + for (auto s = self->server_and_tss_info.begin(); s != self->server_and_tss_info.end(); ++s) { auto serverStatus = self->server_status.get(s->second->lastKnownInterface.id()); if 
(serverStatus.excludeOnRecruit()) { TraceEvent(SevDebug, "DDRecruitExcl1") @@ -4574,7 +4879,7 @@ ACTOR Future storageRecruiter(DDTeamCollection* self, exclusions.insert(addr); } - rsr.criticalRecruitment = self->healthyTeamCount == 0; + rsr.criticalRecruitment = !hasHealthyTeam; for (auto it : exclusions) { rsr.excludeAddresses.push_back(it); } @@ -4611,10 +4916,96 @@ ACTOR Future storageRecruiter(DDTeamCollection* self, .detail("Addr", candidateSSAddr.toString()) .detail("NumExistingSS", numExistingSS); } - self->addActor.send(initializeStorage(self, candidateWorker, ddEnabledState)); + + if (hasHealthyTeam && !tssState->active && tssToRecruit > 0) { + TraceEvent("TSS_Recruit", self->distributorId) + .detail("Stage", "HoldTSS") + .detail("Addr", candidateSSAddr.toString()) + .detail("Locality", candidateWorker.worker.locality.toString()); + + TEST(true); // Starting TSS recruitment + printf("starting recruitment of tss\n"); + self->isTssRecruiting = true; + tssState = makeReference(candidateWorker.worker.locality.dcId()); + + self->addActor.send(initializeStorage(self, candidateWorker, ddEnabledState, true, tssState)); + } else { + if (tssState->active && candidateWorker.worker.locality.dcId() == tssState->dcId) { + TEST(true); // TSS recruits pair in same dc + self->isTssRecruiting = false; + TraceEvent("TSS_Recruit", self->distributorId) + .detail("Stage", "PairSS") + .detail("Addr", candidateSSAddr.toString()) + .detail("Locality", candidateWorker.worker.locality.toString()); + printf("starting recruitment of ss with eventual tss pair in dc \'%s\'\n", + tssState->dcId.present() ? 
tssState->dcId.get().toString().c_str() : ""); + self->addActor.send( + initializeStorage(self, candidateWorker, ddEnabledState, false, tssState)); + // successfully started recruitment of pair, reset tss recruitment state + tssState = makeReference(); + tssToRecruit--; + if (tssToRecruit > 0) { + printf("%d tss pairs left to recruit\n", tssToRecruit); + } + } else { + if (tssState->active) { + TEST(true); // TSS recruitment skipped potential pair because it's in a different dc + printf("Recruiting normal ss (no tss) b/c new ss is in different dc \'%s\' than tss " + "\'%s\'\n", + candidateWorker.worker.locality.dcId().present() + ? candidateWorker.worker.locality.dcId().get().toString().c_str() + : "", + tssState->dcId.present() ? tssState->dcId.get().toString().c_str() : ""); + } else { + printf("recruiting normal ss (no tss)\n"); + } + self->addActor.send(initializeStorage( + self, candidateWorker, ddEnabledState, false, makeReference())); + } + } } - when(wait(db->onChange())) { // SOMEDAY: only if clusterInterface changes? + when(wait(db->onChange())) { // SOMEDAY: only if clusterInterface or tss changes? fCandidateWorker = Future(); + // TODO REMOVE print + int newTssToRecruit = self->configuration.desiredTSSCount - db->get().client.tssMapping.size(); + if (newTssToRecruit != tssToRecruit) { + TraceEvent("TSS_RecruitUpdated", self->distributorId).detail("Count", newTssToRecruit); + tssToRecruit = newTssToRecruit; + } + + // TODO HANDLE HERE if count is more than desired tss? 
+ + printf("DD updated tssToRecruit=%d (%d - %d)\n", + tssToRecruit, + self->configuration.desiredTSSCount, + db->get().client.tssMapping.size()); + + if (self->isTssRecruiting && (tssToRecruit == 0 || self->zeroHealthyTeams->get())) { + TEST(tssToRecruit == 0); // tss recruitment cancelled due to too many TSS + TEST(self->zeroHealthyTeams->get()); // tss recruitment cancelled due zero healthy teams + TraceEvent(SevWarn, "TSS_RecruitCancelled", self->distributorId) + .detail("Reason", tssToRecruit == 0 ? "ConfigChange" : "ZeroHealthyTeams"); + printf("Cancelling tss recruitment! tssToRecruit: %d, zeroHealthyTeams: %s\n", + tssToRecruit, + self->zeroHealthyTeams->get() ? "T" : "F"); + tssState->cancel(); + tssState = makeReference(); + self->isTssRecruiting = false; + } + } + when(wait(self->zeroHealthyTeams->onChange())) { + // TODO refactor? + if (self->isTssRecruiting && self->zeroHealthyTeams->get()) { + TEST(self->zeroHealthyTeams->get()); // tss recruitment cancelled due zero healthy teams 2 + TraceEvent(SevWarn, "TSS_RecruitCancelled", self->distributorId) + .detail("Reason", "ZeroHealthyTeams"); + printf("Cancelling tss recruitment!! tssToRecruit: %d, zeroHealthyTeams: %s\n", + tssToRecruit, + self->zeroHealthyTeams->get() ? 
"T" : "F"); + tssState->cancel(); + tssState = makeReference(); + self->isTssRecruiting = false; + } } when(wait(self->restartRecruiting.onTrigger())) {} } @@ -4760,6 +5151,13 @@ ACTOR Future dataDistributionTeamCollection(Reference te self->restartRecruiting.trigger(); } + when(UID removedTSS = waitNext(self->removedTSS.getFuture())) { + TEST(true); // TSS removed from database + self->removeTSS(removedTSS); + serverRemoved.send(Void()); + + self->restartRecruiting.trigger(); + } when(wait(self->zeroHealthyTeams->onChange())) { if (self->zeroHealthyTeams->get()) { self->restartRecruiting.trigger(); @@ -5254,6 +5652,8 @@ ACTOR Future dataDistribution(Reference self, wait(waitForAll(actors)); return Void(); } catch (Error& e) { + // TODO REMOVE + printf("DD got error! %d\n", e.code()); trackerCancelled = true; state Error err = e; TraceEvent("DataDistributorDestroyTeamCollections").error(e); @@ -5265,7 +5665,8 @@ ACTOR Future dataDistribution(Reference self, if (removeFailedServer.getFuture().isReady() && !removeFailedServer.getFuture().isError()) { TraceEvent("RemoveFailedServer", removeFailedServer.getFuture().get()).error(err); wait(removeKeysFromFailedServer(cx, removeFailedServer.getFuture().get(), lock, ddEnabledState)); - wait(removeStorageServer(cx, removeFailedServer.getFuture().get(), lock, ddEnabledState)); + Optional tssPairID; + wait(removeStorageServer(cx, removeFailedServer.getFuture().get(), tssPairID, lock, ddEnabledState)); } else { if (err.code() != error_code_movekeys_conflict) { throw err; @@ -5921,3 +6322,5 @@ TEST_CASE("/DataDistribution/AddTeamsBestOf/NotEnoughServers") { return Void(); } + +// TODO add unit test for TSS recruitment? 
diff --git a/fdbserver/DataDistributionTracker.actor.cpp b/fdbserver/DataDistributionTracker.actor.cpp index 94f38622f0..51501c9b62 100644 --- a/fdbserver/DataDistributionTracker.actor.cpp +++ b/fdbserver/DataDistributionTracker.actor.cpp @@ -497,14 +497,22 @@ ACTOR Future shardSplitter(DataDistributionTracker* self, .detail("MaxBytes", shardBounds.max.bytes) .detail("MetricsBytes", metrics.bytes) .detail("Bandwidth", - bandwidthStatus == BandwidthStatusHigh ? "High" - : bandwidthStatus == BandwidthStatusNormal ? "Normal" - : "Low") + bandwidthStatus == BandwidthStatusHigh + ? "High" + : bandwidthStatus == BandwidthStatusNormal ? "Normal" : "Low") .detail("BytesPerKSec", metrics.bytesPerKSecond) .detail("NumShards", numShards); } if (numShards > 1) { + // TODO REMOVE + printf("Splitting [%s - %s) into %d shards:\n", + splitKeys[0].toString().c_str(), + splitKeys[numShards].toString().c_str(), + numShards); + for (int i = 0; i < numShards; i++) { + printf(" [%s - %s)\n", splitKeys[i].toString().c_str(), splitKeys[i + 1].toString().c_str()); + } int skipRange = deterministicRandom()->randomInt(0, numShards); // The queue can't deal with RelocateShard requests which split an existing shard into three pieces, so // we have to send the unskipped ranges in this order (nibbling in from the edges of the old range) diff --git a/fdbserver/Knobs.cpp b/fdbserver/Knobs.cpp index fc1234d243..8e507f1727 100644 --- a/fdbserver/Knobs.cpp +++ b/fdbserver/Knobs.cpp @@ -217,6 +217,7 @@ void ServerKnobs::initialize(bool randomize, ClientKnobs* clientKnobs, bool isSi init( SERVER_LIST_DELAY, 1.0 ); init( RECRUITMENT_IDLE_DELAY, 1.0 ); init( STORAGE_RECRUITMENT_DELAY, 10.0 ); + init( TSS_RECRUITMENT_TIMEOUT, 3*STORAGE_RECRUITMENT_DELAY ); if (randomize && BUGGIFY ) TSS_RECRUITMENT_TIMEOUT = 1.0; //Super low timeout should cause tss recruitments to fail init( DATA_DISTRIBUTION_LOGGING_INTERVAL, 5.0 ); init( DD_ENABLED_CHECK_DELAY, 1.0 ); init( DD_STALL_CHECK_DELAY, 0.4 ); //Must be larger 
than 2*MAX_BUGGIFIED_DELAY diff --git a/fdbserver/Knobs.h b/fdbserver/Knobs.h index be2caba6a1..9a4cc4a047 100644 --- a/fdbserver/Knobs.h +++ b/fdbserver/Knobs.h @@ -167,6 +167,7 @@ public: double SERVER_LIST_DELAY; double RECRUITMENT_IDLE_DELAY; double STORAGE_RECRUITMENT_DELAY; + double TSS_RECRUITMENT_TIMEOUT; double DATA_DISTRIBUTION_LOGGING_INTERVAL; double DD_ENABLED_CHECK_DELAY; double DD_STALL_CHECK_DELAY; diff --git a/fdbserver/MoveKeys.actor.cpp b/fdbserver/MoveKeys.actor.cpp index 1f2e3a9780..927a7af00b 100644 --- a/fdbserver/MoveKeys.actor.cpp +++ b/fdbserver/MoveKeys.actor.cpp @@ -20,6 +20,8 @@ #include "flow/Util.h" #include "fdbrpc/FailureMonitor.h" +#include "fdbclient/DatabaseContext.h" // for tss mapping +#include "fdbclient/KeyBackedTypes.h" #include "fdbclient/SystemData.h" #include "fdbserver/MoveKeys.actor.h" #include "fdbserver/Knobs.h" @@ -99,6 +101,7 @@ ACTOR static Future checkMoveKeysLock(Transaction* tr, bool isWrite = true) { if (!ddEnabledState->isDDEnabled()) { TraceEvent(SevDebug, "DDDisabledByInMemoryCheck"); + printf("MK: DD disabled\n"); throw movekeys_conflict(); } Optional readVal = wait(tr->get(moveKeysLockOwnerKey)); @@ -110,6 +113,7 @@ ACTOR static Future checkMoveKeysLock(Transaction* tr, UID lastWrite = readVal.present() ? 
BinaryReader::fromStringRef(readVal.get(), Unversioned()) : UID(); if (lastWrite != lock.prevWrite) { TEST(true); // checkMoveKeysLock: Conflict with previous owner + printf("MK: conflict with previous owner\n"); throw movekeys_conflict(); } @@ -143,6 +147,7 @@ ACTOR static Future checkMoveKeysLock(Transaction* tr, return Void(); } else { TEST(true); // checkMoveKeysLock: Conflict with new owner + printf("MK: conflict %s with new owner %s\n", currentOwner.toString().c_str(), lock.myOwner.toString().c_str()); throw movekeys_conflict(); } } @@ -158,7 +163,7 @@ ACTOR Future> checkReadWrite(Future> f return Optional(uid); } -Future removeOldDestinations(Transaction* tr, +Future removeOldDestinations(Reference tr, UID oldDest, VectorRef shards, KeyRangeRef currentKeys) { @@ -235,7 +240,7 @@ ACTOR Future> addReadWriteDestinations(KeyRangeRef shard, } ACTOR Future>> additionalSources(RangeResult shards, - Transaction* tr, + Reference tr, int desiredHealthy, int maxServers) { state RangeResult UIDtoTagMap = wait(tr->getRange(serverTagKeys, CLIENT_KNOBS->TOO_MANY)); @@ -325,6 +330,12 @@ ACTOR static Future startMoveKeys(Database occ, state Future warningLogger = logWarningAfter("StartMoveKeysTooLong", 600, servers); // state TraceInterval waitInterval(""); + // TODO REMOVE + printf("starting move keys for [%s, %s): to %s\n", + keys.begin.toString().c_str(), + keys.end.toString().c_str(), + servers[0].toString().c_str()); + wait(startMoveKeysLock->take(TaskPriority::DataDistributionLaunch)); state FlowLock::Releaser releaser(*startMoveKeysLock); @@ -343,7 +354,8 @@ ACTOR static Future startMoveKeys(Database occ, TEST(begin > keys.begin); // Multi-transactional startMoveKeys batches++; - state Transaction tr(occ); + // RYW to optimize re-reading the same key ranges + state Reference tr = makeReference(occ); state int retries = 0; loop { @@ -356,15 +368,16 @@ ACTOR static Future startMoveKeys(Database occ, // Keep track of shards for all src servers so that we can preserve 
their values in serverKeys state Map> shardMap; - tr.info.taskID = TaskPriority::MoveKeys; - tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); + tr->getTransaction().info.taskID = TaskPriority::MoveKeys; + tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - wait(checkMoveKeysLock(&tr, lock, ddEnabledState)); + wait(checkMoveKeysLock(&(tr->getTransaction()), lock, ddEnabledState)); vector>> serverListEntries; serverListEntries.reserve(servers.size()); for (int s = 0; s < servers.size(); s++) - serverListEntries.push_back(tr.get(serverListKeyFor(servers[s]))); + serverListEntries.push_back(tr->get(serverListKeyFor(servers[s]))); state vector> serverListValues = wait(getAll(serverListEntries)); for (int s = 0; s < serverListValues.size(); s++) { @@ -380,11 +393,12 @@ ACTOR static Future startMoveKeys(Database occ, // Get all existing shards overlapping keys (exclude any that have been processed in a previous // iteration of the outer loop) state KeyRange currentKeys = KeyRangeRef(begin, keys.end); - state RangeResult old = wait(krmGetRanges(&tr, - keyServersPrefix, - currentKeys, - SERVER_KNOBS->MOVE_KEYS_KRM_LIMIT, - SERVER_KNOBS->MOVE_KEYS_KRM_LIMIT_BYTES)); + + state RangeResult old = wait(krmGetRanges(tr, + keyServersPrefix, + currentKeys, + SERVER_KNOBS->MOVE_KEYS_KRM_LIMIT, + SERVER_KNOBS->MOVE_KEYS_KRM_LIMIT_BYTES)); // Determine the last processed key (which will be the beginning for the next iteration) state Key endKey = old.end()[-1].key; @@ -399,10 +413,10 @@ ACTOR static Future startMoveKeys(Database occ, // printf("'%s': '%s'\n", old[i].key.toString().c_str(), old[i].value.toString().c_str()); // Check that enough servers for each shard are in the correct state - state RangeResult UIDtoTagMap = wait(tr.getRange(serverTagKeys, CLIENT_KNOBS->TOO_MANY)); + state RangeResult UIDtoTagMap = wait(tr->getRange(serverTagKeys, CLIENT_KNOBS->TOO_MANY)); ASSERT(!UIDtoTagMap.more 
&& UIDtoTagMap.size() < CLIENT_KNOBS->TOO_MANY); vector> addAsSource = wait(additionalSources( - old, &tr, servers.size(), SERVER_KNOBS->MAX_ADDED_SOURCES_MULTIPLIER * servers.size())); + old, tr, servers.size(), SERVER_KNOBS->MAX_ADDED_SOURCES_MULTIPLIER * servers.size())); // For each intersecting range, update keyServers[range] dest to be servers and clear existing dest // servers from serverKeys @@ -417,7 +431,7 @@ ACTOR static Future startMoveKeys(Database occ, // .detail("KeyEnd", rangeIntersectKeys.end.toString()) // .detail("OldSrc", describe(src)) // .detail("OldDest", describe(dest)) - // .detail("ReadVersion", tr.getReadVersion().get()); + // .detail("ReadVersion", tr->getReadVersion().get()); for (auto& uid : addAsSource[i]) { src.push_back(uid); @@ -425,7 +439,7 @@ ACTOR static Future startMoveKeys(Database occ, uniquify(src); // Update dest servers for this range to be equal to servers - krmSetPreviouslyEmptyRange(&tr, + krmSetPreviouslyEmptyRange(&(tr->getTransaction()), keyServersPrefix, rangeIntersectKeys, keyServersValue(UIDtoTagMap, src, servers), @@ -455,7 +469,7 @@ ACTOR static Future startMoveKeys(Database occ, vector> actors; for (oldDest = oldDests.begin(); oldDest != oldDests.end(); ++oldDest) if (std::find(servers.begin(), servers.end(), *oldDest) == servers.end()) - actors.push_back(removeOldDestinations(&tr, *oldDest, shardMap[*oldDest], currentKeys)); + actors.push_back(removeOldDestinations(tr, *oldDest, shardMap[*oldDest], currentKeys)); // Update serverKeys to include keys (or the currently processed subset of keys) for each SS in // servers @@ -464,12 +478,12 @@ ACTOR static Future startMoveKeys(Database occ, // to have the same shard boundaries If that invariant was important, we would have to move this // inside the loop above and also set it for the src servers actors.push_back(krmSetRangeCoalescing( - &tr, serverKeysPrefixFor(servers[i]), currentKeys, allKeys, serverKeysTrue)); + tr, serverKeysPrefixFor(servers[i]), currentKeys, 
allKeys, serverKeysTrue)); } wait(waitForAll(actors)); - wait(tr.commit()); + wait(tr->commit()); /*TraceEvent("StartMoveKeysCommitDone", relocationIntervalId) .detail("CommitVersion", tr.getCommittedVersion()) @@ -481,7 +495,7 @@ ACTOR static Future startMoveKeys(Database occ, state Error err = e; if (err.code() == error_code_move_to_removed_server) throw; - wait(tr.onError(e)); + wait(tr->onError(e)); if (retries % 10 == 0) { TraceEvent( @@ -500,7 +514,7 @@ ACTOR static Future startMoveKeys(Database occ, } // printf("Committed moving '%s'-'%s' (version %lld)\n", keys.begin.toString().c_str(), - // keys.end.toString().c_str(), tr.getCommittedVersion()); + // keys.end.toString().c_str(), tr->getCommittedVersion()); TraceEvent(SevDebug, interval.end(), relocationIntervalId) .detail("Batches", batches) .detail("Shards", shards) @@ -517,15 +531,37 @@ ACTOR Future waitForShardReady(StorageServerInterface server, KeyRange keys, Version minVersion, GetShardStateRequest::waitMode mode) { + // TODO REMOVE + printf("waiting for shard [%s, %s) in state %d from %sss %s @ %lld\n", + keys.begin.toString().c_str(), + keys.end.toString().c_str(), + mode, + server.isTss ? "t" : "", + server.id().toString().c_str(), + minVersion); loop { try { GetShardStateReply rep = wait(server.getShardState.getReply(GetShardStateRequest(keys, mode), TaskPriority::MoveKeys)); if (rep.first >= minVersion) { + // TODO REMOVE + printf("shard [%s, %s) is in state %d from %sss %s @ %lld >= %lld\n", + keys.begin.toString().c_str(), + keys.end.toString().c_str(), + mode, + server.isTss ? "t" : "", + server.id().toString().c_str(), + rep.first, + minVersion); return Void(); } wait(delayJittered(SERVER_KNOBS->SHARD_READY_DELAY, TaskPriority::MoveKeys)); } catch (Error& e) { + printf("Waiting for shard from %sss %s getValue=%s got error! %d\n", + server.isTss ? 
"t" : "", + server.id().toString().c_str(), + server.getValue.getEndpoint().token.toString().c_str(), + e.code()); if (e.code() != error_code_timed_out) { if (e.code() != error_code_broken_promise) throw e; @@ -536,6 +572,8 @@ ACTOR Future waitForShardReady(StorageServerInterface server, } } +// best effort to also wait for TSS on data move + ACTOR Future checkFetchingState(Database cx, vector dest, KeyRange keys, @@ -557,6 +595,8 @@ ACTOR Future checkFetchingState(Database cx, serverListEntries.push_back(tr.get(serverListKeyFor(dest[s]))); state vector> serverListValues = wait(getAll(serverListEntries)); vector> requests; + state vector> tssRequests; + ClientDBInfo clientInfo = cx->clientInfo->get(); for (int s = 0; s < serverListValues.size(); s++) { if (!serverListValues[s].present()) { // FIXME: Is this the right behavior? dataMovementComplete will never be sent! @@ -567,10 +607,25 @@ ACTOR Future checkFetchingState(Database cx, ASSERT(si.id() == dest[s]); requests.push_back( waitForShardReady(si, keys, tr.getReadVersion().get(), GetShardStateRequest::FETCHING)); + + Optional tssPair = clientInfo.getTssPair(si.id()); + if (tssPair.present()) { + tssRequests.push_back(waitForShardReady( + tssPair.get(), keys, tr.getReadVersion().get(), GetShardStateRequest::FETCHING)); + } } wait(timeoutError(waitForAll(requests), SERVER_KNOBS->SERVER_READY_QUORUM_TIMEOUT, TaskPriority::MoveKeys)); + // If normal servers return normally, give TSS data movement a bit of a chance, but don't block on it, and + // ignore errors in tss requests + if (tssRequests.size()) { + wait(timeout(waitForAllReady(tssRequests), + SERVER_KNOBS->SERVER_READY_QUORUM_TIMEOUT / 2, + Void(), + TaskPriority::MoveKeys)); + } + dataMovementComplete.send(Void()); return Void(); } catch (Error& e) { @@ -601,9 +656,18 @@ ACTOR static Future finishMoveKeys(Database occ, state Key endKey; state int retries = 0; state FlowLock::Releaser releaser; + state int waitForTSSCounter = + 2; // try waiting for tss for 
a 2 loops, give up if they're stuck to not affect the rest of the cluster + + // for killing tss if any get stuck during movekeys + state KeyBackedMap tssMapDB = KeyBackedMap(tssMappingKeys.begin); + state std::vector tssToKill; + state std::set tssToIgnore; ASSERT(!destinationTeam.empty()); + printf("finishing move keys for [%s, %s)\n", keys.begin.toString().c_str(), keys.end.toString().c_str()); + try { TraceEvent(SevDebug, interval.begin(), relocationIntervalId) .detail("KeyBegin", keys.begin) @@ -616,9 +680,53 @@ ACTOR static Future finishMoveKeys(Database occ, state Transaction tr(occ); - // printf("finishMoveKeys( '%s'-'%s' )\n", keys.begin.toString().c_str(), keys.end.toString().c_str()); + // TODO re-comment and change back + printf("finishMoveKeys( '%s'-'%s' )\n", begin.toString().c_str(), keys.end.toString().c_str()); loop { try { + if (tssToKill.size()) { + // TODO could move this to helper method? + // TODO add trace event + TEST(true); // killing TSS because they were unavailable for movekeys + printf("KILLING %d TSS BECAUSE THEY TIMED OUT IN MOVEKEYS\n", tssToKill.size()); + + // kill tss BEFORE committing main txn so that client requests don't make it to the tss when it + // has a different shard set than its pair use a different RYW transaction since i'm too lazy + // (and don't want to add bugs) by changing whole method to RYW. also using a different + // transaction makes it commit earlier which we may need to guarantee causality of tss getting + // removed before client sends a request to this key range on the new ss + state Reference tssTr = + makeReference(occ); + loop { + try { + tssTr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); + tssTr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + for (auto& tss : tssToKill) { + // DO NOT remove server list key - that'll break a bunch of stuff. 
DD will + // eventually call removeStorageServer tssTr->clear(serverListKeyFor(tss.id())); + tssTr->clear(serverTagKeyFor(tss.id())); + // tssTr->clear(serverTagHistoryRangeFor(tss.id())); + tssMapDB.erase(tssTr, tss.tssPairID); + } + tssTr->set(tssMappingChangeKey, deterministicRandom()->randomUniqueID().toString()); + wait(tssTr->commit()); + + for (auto& tss : tssToKill) { + // TODO ADD trace event (sev30?) + printf("Successfully removed TSS %s in finishMoveKeys\n", + tss.id().toString().c_str()); + tssToIgnore.insert(tss.id()); + } + tssToKill.clear(); + + break; + } catch (Error& e) { + printf("MoveKeys TSS Removal Transaction got error %d\n", e.code()); + wait(tssTr->onError(e)); + } + } + } + tr.info.taskID = TaskPriority::MoveKeys; tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); @@ -763,6 +871,8 @@ ACTOR static Future finishMoveKeys(Database occ, // between // now and when this transaction commits. state vector> serverReady; // only for count below + state vector> tssReady; // for waiting in parallel with tss + state vector tssReadyInterfs; state vector newDestinations; std::set completeSrcSet(completeSrc.begin(), completeSrc.end()); for (auto& it : dest) { @@ -789,22 +899,104 @@ ACTOR static Future finishMoveKeys(Database occ, storageServerInterfaces.push_back(si); } + // update client info in case tss mapping changed or server got updated + + // Use most up to date version of tss mapping + ClientDBInfo clientInfo = occ->clientInfo->get(); + // Wait for new destination servers to fetch the keys + serverReady.reserve(storageServerInterfaces.size()); - for (int s = 0; s < storageServerInterfaces.size(); s++) + tssReady.reserve(storageServerInterfaces.size()); + tssReadyInterfs.reserve(storageServerInterfaces.size()); + for (int s = 0; s < storageServerInterfaces.size(); s++) { serverReady.push_back(waitForShardReady(storageServerInterfaces[s], keys, tr.getReadVersion().get(), GetShardStateRequest::READABLE)); - 
wait(timeout(waitForAll(serverReady), + + Optional tssPair = + clientInfo.getTssPair(storageServerInterfaces[s].id()); + + if (tssPair.present() && waitForTSSCounter > 0 && !tssToIgnore.count(tssPair.get().id())) { + tssReadyInterfs.push_back(tssPair.get()); + tssReady.push_back(waitForShardReady( + tssPair.get(), keys, tr.getReadVersion().get(), GetShardStateRequest::READABLE)); + } + } + + // Wait for all storage server moves, and explicitly swallow errors for tss ones with + // waitForAllReady If this takes too long the transaction will time out and retry, which is ok + wait(timeout(waitForAll(serverReady) && waitForAllReady(tssReady), SERVER_KNOBS->SERVER_READY_QUORUM_TIMEOUT, Void(), TaskPriority::MoveKeys)); + + // Check to see if we're waiting only on tss. If so, decrement the waiting counter. + // If the waiting counter is zero, kill the slow/non-responsive tss processes before finalizing the + // data move. + if (tssReady.size()) { + bool allSSDone = true; + for (auto& f : serverReady) { + allSSDone &= f.isReady() && !f.isError(); + if (!allSSDone) { + break; + } + } + + if (allSSDone) { + bool anyTssNotDone = false; + + for (auto& f : tssReady) { + if (!f.isReady() || f.isError()) { + anyTssNotDone = true; + printf("MK: [%s - %s) waiting on tss!\n", + begin.toString().c_str(), + keys.end.toString().c_str()); + waitForTSSCounter--; + break; + } + } + + if (anyTssNotDone && waitForTSSCounter == 0) { + for (int i = 0; i < tssReady.size(); i++) { + if (!tssReady[i].isReady() || tssReady[i].isError()) { + // TODO trace event!! 
+ printf("TSS NOT DONE %s with move keys, killing!!\n", + tssReadyInterfs[i].id().toString().c_str()); + tssToKill.push_back(tssReadyInterfs[i]); + } + } + // repeat loop and go back to start to kill tss' before continuing on + continue; + } + } + } + int count = dest.size() - newDestinations.size(); for (int s = 0; s < serverReady.size(); s++) count += serverReady[s].isReady() && !serverReady[s].isError(); - // printf(" fMK: moved data to %d/%d servers\n", count, serverReady.size()); + int tssCount = 0; + for (int s = 0; s < tssReady.size(); s++) + tssCount += tssReady[s].isReady() && !tssReady[s].isError(); + + // TODO re-comment + if (tssReady.size()) { + printf(" fMK: [%s - %s) moved data to %d/%d servers and %d/%d tss\n", + begin.toString().c_str(), + keys.end.toString().c_str(), + count, + serverReady.size(), + tssCount, + tssReady.size()); + } else { + printf(" fMK: [%s - %s) moved data to %d/%d servers\n", + begin.toString().c_str(), + keys.end.toString().c_str(), + count, + serverReady.size()); + } TraceEvent(SevDebug, waitInterval.end(), relocationIntervalId).detail("ReadyServers", count); if (count == dest.size()) { @@ -834,6 +1026,7 @@ ACTOR static Future finishMoveKeys(Database occ, } tr.reset(); } catch (Error& error) { + printf(" fMK: error %d\n", error.code()); if (error.code() == error_code_actor_cancelled) throw; state Error err = error; @@ -862,43 +1055,50 @@ ACTOR static Future finishMoveKeys(Database occ, } ACTOR Future> addStorageServer(Database cx, StorageServerInterface server) { - state Transaction tr(cx); + state Reference tr = makeReference(cx); + state KeyBackedMap tssMapDB = KeyBackedMap(tssMappingKeys.begin); state int maxSkipTags = 1; + + printf("%sSS %s adding itself\n", server.isTss ? 
"T" : "", server.id().toString().c_str()); loop { try { - state Future fTagLocalities = tr.getRange(tagLocalityListKeys, CLIENT_KNOBS->TOO_MANY); - state Future> fv = tr.get(serverListKeyFor(server.id())); + // TODO should also set priority system immediate? also why is this needed? + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - state Future> fExclProc = tr.get( + // TODO don't fetch tag localities, all tags, and history tags if tss. Just fetch pair's tag + state Future fTagLocalities = tr->getRange(tagLocalityListKeys, CLIENT_KNOBS->TOO_MANY); + state Future> fv = tr->get(serverListKeyFor(server.id())); + + state Future> fExclProc = tr->get( StringRef(encodeExcludedServersKey(AddressExclusion(server.address().ip, server.address().port)))); state Future> fExclIP = - tr.get(StringRef(encodeExcludedServersKey(AddressExclusion(server.address().ip)))); - state Future> fFailProc = - tr.get(StringRef(encodeFailedServersKey(AddressExclusion(server.address().ip, server.address().port)))); + tr->get(StringRef(encodeExcludedServersKey(AddressExclusion(server.address().ip)))); + state Future> fFailProc = tr->get( + StringRef(encodeFailedServersKey(AddressExclusion(server.address().ip, server.address().port)))); state Future> fFailIP = - tr.get(StringRef(encodeFailedServersKey(AddressExclusion(server.address().ip)))); + tr->get(StringRef(encodeFailedServersKey(AddressExclusion(server.address().ip)))); state Future> fExclProc2 = server.secondaryAddress().present() - ? tr.get(StringRef(encodeExcludedServersKey( + ? tr->get(StringRef(encodeExcludedServersKey( AddressExclusion(server.secondaryAddress().get().ip, server.secondaryAddress().get().port)))) : Future>(Optional()); state Future> fExclIP2 = server.secondaryAddress().present() - ? tr.get(StringRef(encodeExcludedServersKey(AddressExclusion(server.secondaryAddress().get().ip)))) + ? 
tr->get(StringRef(encodeExcludedServersKey(AddressExclusion(server.secondaryAddress().get().ip)))) : Future>(Optional()); state Future> fFailProc2 = server.secondaryAddress().present() - ? tr.get(StringRef(encodeFailedServersKey( + ? tr->get(StringRef(encodeFailedServersKey( AddressExclusion(server.secondaryAddress().get().ip, server.secondaryAddress().get().port)))) : Future>(Optional()); state Future> fFailIP2 = server.secondaryAddress().present() - ? tr.get(StringRef(encodeFailedServersKey(AddressExclusion(server.secondaryAddress().get().ip)))) + ? tr->get(StringRef(encodeFailedServersKey(AddressExclusion(server.secondaryAddress().get().ip)))) : Future>(Optional()); - state Future fTags = tr.getRange(serverTagKeys, CLIENT_KNOBS->TOO_MANY, true); - state Future fHistoryTags = tr.getRange(serverTagHistoryKeys, CLIENT_KNOBS->TOO_MANY, true); + state Future fTags = tr->getRange(serverTagKeys, CLIENT_KNOBS->TOO_MANY, true); + state Future fHistoryTags = tr->getRange(serverTagHistoryKeys, CLIENT_KNOBS->TOO_MANY, true); wait(success(fTagLocalities) && success(fv) && success(fTags) && success(fHistoryTags) && success(fExclProc) && success(fExclIP) && success(fFailProc) && success(fFailIP) && @@ -908,70 +1108,109 @@ ACTOR Future> addStorageServer(Database cx, StorageServe if (fExclProc.get().present() || fExclIP.get().present() || fFailProc.get().present() || fFailIP.get().present() || fExclProc2.get().present() || fExclIP2.get().present() || fFailProc2.get().present() || fFailIP2.get().present()) { + printf("%sSS %s failing to recruit because of exclusion\n", + server.isTss ? 
"T" : "", + server.id().toString().c_str()); throw recruitment_failed(); } if (fTagLocalities.get().more || fTags.get().more || fHistoryTags.get().more) ASSERT(false); - int8_t maxTagLocality = 0; - state int8_t locality = -1; - for (auto& kv : fTagLocalities.get()) { - int8_t loc = decodeTagLocalityListValue(kv.value); - if (decodeTagLocalityListKey(kv.key) == server.locality.dcId()) { - locality = loc; - break; - } - maxTagLocality = std::max(maxTagLocality, loc); - } - - if (locality == -1) { - locality = maxTagLocality + 1; - if (locality < 0) - throw recruitment_failed(); - tr.set(tagLocalityListKeyFor(server.locality.dcId()), tagLocalityListValue(locality)); - } - - int skipTags = deterministicRandom()->randomInt(0, maxSkipTags); - - state uint16_t tagId = 0; - std::vector usedTags; - for (auto& it : fTags.get()) { - Tag t = decodeServerTagValue(it.value); - if (t.locality == locality) { - usedTags.push_back(t.id); - } - } - for (auto& it : fHistoryTags.get()) { - Tag t = decodeServerTagValue(it.value); - if (t.locality == locality) { - usedTags.push_back(t.id); - } - } - std::sort(usedTags.begin(), usedTags.end()); - - int usedIdx = 0; - for (; usedTags.size() > 0 && tagId <= usedTags.end()[-1]; tagId++) { - if (tagId < usedTags[usedIdx]) { - if (skipTags == 0) + state Tag tag; + if (server.isTss) { + bool foundTag = false; + for (auto& it : fTags.get()) { + UID key = decodeServerTagKey(it.key); + if (key == server.tssPairID) { + tag = decodeServerTagValue(it.value); + foundTag = true; break; - skipTags--; - } else { - usedIdx++; + } } + if (!foundTag) { + throw recruitment_failed(); + } + // ASSERT(foundTag); // TSS's pair was removed before TSS could register. Should never happen, since the + // SS shouldn't be tracked by DD until this completes. 
+ printf("TSS %s found tag %s for pair %s\n", + server.id().toString().c_str(), + tag.toString().c_str(), + server.tssPairID.toString().c_str()); + tssMapDB.set(tr, server.tssPairID, server.id()); + tr->set(tssMappingChangeKey, deterministicRandom()->randomUniqueID().toString()); + + } else { + int8_t maxTagLocality = 0; + state int8_t locality = -1; + // TODO i think tss can ignore this part? + for (auto& kv : fTagLocalities.get()) { + int8_t loc = decodeTagLocalityListValue(kv.value); + if (decodeTagLocalityListKey(kv.key) == server.locality.dcId()) { + locality = loc; + break; + } + maxTagLocality = std::max(maxTagLocality, loc); + } + + if (locality == -1) { + locality = maxTagLocality + 1; + if (locality < 0) { + throw recruitment_failed(); + } + tr->set(tagLocalityListKeyFor(server.locality.dcId()), tagLocalityListValue(locality)); + } + + int skipTags = deterministicRandom()->randomInt(0, maxSkipTags); + + state uint16_t tagId = 0; + std::vector usedTags; + for (auto& it : fTags.get()) { + Tag t = decodeServerTagValue(it.value); + if (t.locality == locality) { + usedTags.push_back(t.id); + } + } + for (auto& it : fHistoryTags.get()) { + Tag t = decodeServerTagValue(it.value); + if (t.locality == locality) { + usedTags.push_back(t.id); + } + } + std::sort(usedTags.begin(), usedTags.end()); + + int usedIdx = 0; + for (; usedTags.size() > 0 && tagId <= usedTags.end()[-1]; tagId++) { + if (tagId < usedTags[usedIdx]) { + if (skipTags == 0) + break; + skipTags--; + } else { + usedIdx++; + } + } + tagId += skipTags; + + tag = Tag(locality, tagId); + + tr->set(serverTagKeyFor(server.id()), serverTagValue(tag)); + KeyRange conflictRange = singleKeyRange(serverTagConflictKeyFor(tag)); + tr->addReadConflictRange(conflictRange); + tr->addWriteConflictRange(conflictRange); } - tagId += skipTags; - state Tag tag(locality, tagId); - tr.set(serverTagKeyFor(server.id()), serverTagValue(tag)); - tr.set(serverListKeyFor(server.id()), serverListValue(server)); - KeyRange 
conflictRange = singleKeyRange(serverTagConflictKeyFor(tag)); - tr.addReadConflictRange(conflictRange); - tr.addWriteConflictRange(conflictRange); - - wait(tr.commit()); - return std::make_pair(tr.getCommittedVersion(), tag); + tr->set(serverListKeyFor(server.id()), serverListValue(server)); + wait(tr->commit()); + printf("%sSS %s successfully added itself @ %lld\n", + server.isTss ? "T" : "", + server.id().toString().c_str(), + tr->getCommittedVersion()); + return std::make_pair(tr->getCommittedVersion(), tag); } catch (Error& e) { + printf("%sSS %s got error adding itself: %d!!\n", + server.isTss ? "T" : "", + server.id().toString().c_str(), + e.code()); if (e.code() == error_code_commit_unknown_result) throw recruitment_failed(); // There is a remote possibility that we successfully added ourselves and // then someone removed us, so we have to fail @@ -980,12 +1219,12 @@ ACTOR Future> addStorageServer(Database cx, StorageServe maxSkipTags = SERVER_KNOBS->MAX_SKIP_TAGS; } - wait(tr.onError(e)); + wait(tr->onError(e)); } } } // A SS can be removed only if all data (shards) on the SS have been moved away from the SS. 
-ACTOR Future canRemoveStorageServer(Transaction* tr, UID serverID) { +ACTOR Future canRemoveStorageServer(Reference tr, UID serverID) { RangeResult keys = wait(krmGetRanges(tr, serverKeysPrefixFor(serverID), allKeys, 2)); ASSERT(keys.size() >= 2); @@ -1005,34 +1244,39 @@ ACTOR Future canRemoveStorageServer(Transaction* tr, UID serverID) { ACTOR Future removeStorageServer(Database cx, UID serverID, + Optional tssPairID, MoveKeysLock lock, const DDEnabledState* ddEnabledState) { - state Transaction tr(cx); + state KeyBackedMap tssMapDB = KeyBackedMap(tssMappingKeys.begin); + state Reference tr = makeReference(cx); state bool retry = false; state int noCanRemoveCount = 0; + + printf("Removing storage server %s\n", serverID.toString().c_str()); + loop { try { - tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); - wait(checkMoveKeysLock(&tr, lock, ddEnabledState)); + tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + wait(checkMoveKeysLock(&(tr->getTransaction()), lock, ddEnabledState)); TraceEvent("RemoveStorageServerLocked") .detail("ServerID", serverID) - .detail("Version", tr.getReadVersion().get()); + .detail("Version", tr->getReadVersion().get()); - state bool canRemove = wait(canRemoveStorageServer(&tr, serverID)); + state bool canRemove = wait(canRemoveStorageServer(tr, serverID)); if (!canRemove) { TEST(true); // The caller had a transaction in flight that assigned keys to the server. Wait for it to // reverse its mistake. 
TraceEvent(SevWarn, "NoCanRemove").detail("Count", noCanRemoveCount++).detail("ServerID", serverID); wait(delayJittered(SERVER_KNOBS->REMOVE_RETRY_DELAY, TaskPriority::DataDistributionLaunch)); - tr.reset(); + tr->reset(); TraceEvent("RemoveStorageServerRetrying").detail("CanRemove", canRemove); } else { - - state Future> fListKey = tr.get(serverListKeyFor(serverID)); - state Future fTags = tr.getRange(serverTagKeys, CLIENT_KNOBS->TOO_MANY); - state Future fHistoryTags = tr.getRange(serverTagHistoryKeys, CLIENT_KNOBS->TOO_MANY); - state Future fTagLocalities = tr.getRange(tagLocalityListKeys, CLIENT_KNOBS->TOO_MANY); - state Future fTLogDatacenters = tr.getRange(tLogDatacentersKeys, CLIENT_KNOBS->TOO_MANY); + state Future> fListKey = tr->get(serverListKeyFor(serverID)); + state Future fTags = tr->getRange(serverTagKeys, CLIENT_KNOBS->TOO_MANY); + state Future fHistoryTags = tr->getRange(serverTagHistoryKeys, CLIENT_KNOBS->TOO_MANY); + state Future fTagLocalities = tr->getRange(tagLocalityListKeys, CLIENT_KNOBS->TOO_MANY); + state Future fTLogDatacenters = tr->getRange(tLogDatacentersKeys, CLIENT_KNOBS->TOO_MANY); wait(success(fListKey) && success(fTags) && success(fHistoryTags) && success(fTagLocalities) && success(fTLogDatacenters)); @@ -1072,22 +1316,33 @@ ACTOR Future removeStorageServer(Database cx, if (locality >= 0 && !allLocalities.count(locality)) { for (auto& it : fTagLocalities.get()) { if (locality == decodeTagLocalityListValue(it.value)) { - tr.clear(it.key); + tr->clear(it.key); break; } } } - tr.clear(serverListKeyFor(serverID)); - tr.clear(serverTagKeyFor(serverID)); - tr.clear(serverTagHistoryRangeFor(serverID)); + tr->clear(serverListKeyFor(serverID)); + tr->clear(serverTagKeyFor(serverID)); // the tss uses this to communicate shutdown but it never has a + // server tag key set in the first place + tr->clear(serverTagHistoryRangeFor(serverID)); + + // TODO a small optimization would be to only erase and trigger tss mapping if this is a tss or an ss 
+ // with a tss pair, instead of always + if (tssPairID.present()) { + tssMapDB.erase(tr, tssPairID.get()); + } else { + tssMapDB.erase(tr, serverID); + } + tr->set(tssMappingChangeKey, deterministicRandom()->randomUniqueID().toString()); + retry = true; - wait(tr.commit()); + wait(tr->commit()); return Void(); } } catch (Error& e) { state Error err = e; - wait(tr.onError(e)); + wait(tr->onError(e)); TraceEvent("RemoveStorageServerRetrying").error(err); } } @@ -1099,6 +1354,7 @@ ACTOR Future removeKeysFromFailedServer(Database cx, MoveKeysLock lock, const DDEnabledState* ddEnabledState) { state Key begin = allKeys.begin; + printf("Removing keys from failed server %s\n", serverID.toString().c_str()); // Multi-transactional removal in case of large number of shards, concern in violating 5s transaction limit while (begin < allKeys.end) { state Transaction tr(cx); @@ -1200,6 +1456,8 @@ ACTOR Future moveKeys(Database cx, if (!dataMovementComplete.isSet()) dataMovementComplete.send(Void()); + printf("move keys done for [%s, %s)\n", keys.begin.toString().c_str(), keys.end.toString().c_str()); + return Void(); } diff --git a/fdbserver/MoveKeys.actor.h b/fdbserver/MoveKeys.actor.h index e8ae691878..c8092bbcdd 100644 --- a/fdbserver/MoveKeys.actor.h +++ b/fdbserver/MoveKeys.actor.h @@ -89,13 +89,14 @@ ACTOR Future> addStorageServer(Database cx, StorageServe ACTOR Future removeStorageServer(Database cx, UID serverID, + Optional tssPairID, // if serverID is a tss, set to its ss pair id MoveKeysLock lock, const DDEnabledState* ddEnabledState); // Removes the given storage server permanently from the database. It must already // have no shards assigned to it. The storage server MUST NOT be added again after this // (though a new storage server with a new unique ID may be recruited from the same fdbserver). 
-ACTOR Future canRemoveStorageServer(Transaction* tr, UID serverID); +ACTOR Future canRemoveStorageServer(Reference tr, UID serverID); // Returns true if the given storage server has no keys assigned to it and may be safely removed // Obviously that could change later! ACTOR Future removeKeysFromFailedServer(Database cx, diff --git a/fdbserver/MutationTracking.cpp b/fdbserver/MutationTracking.cpp index 16a17a0f10..b0e7215fb8 100644 --- a/fdbserver/MutationTracking.cpp +++ b/fdbserver/MutationTracking.cpp @@ -30,6 +30,9 @@ // Track up to 2 keys in simulation via enabling MUTATION_TRACKING_ENABLED and setting the keys here. StringRef debugKey = LiteralStringRef(""); StringRef debugKey2 = LiteralStringRef("\xff\xff\xff\xff"); +// StringRef debugKey = LiteralStringRef("\x00\x00\x02\xff\x00\x00\x04\xc1\x00\x00\x00\x01\x00\x00\x00\x02"); // missing +// from ss StringRef debugKey2 = LiteralStringRef("\x00\x00\x02\xff\x00\x00\x01\x89\x00\x00\x00\x04\x00\x00\x00\x02"); +// // missing from tss TraceEvent debugMutationEnabled(const char* context, Version version, MutationRef const& mutation) { if ((mutation.type == mutation.ClearRange || mutation.type == mutation.DebugKeyRange) && diff --git a/fdbserver/QuietDatabase.actor.cpp b/fdbserver/QuietDatabase.actor.cpp index 98f14d545e..40f731aed6 100644 --- a/fdbserver/QuietDatabase.actor.cpp +++ b/fdbserver/QuietDatabase.actor.cpp @@ -294,6 +294,11 @@ ACTOR Future getMaxStorageServerQueueSize(Database cx, Reference servers = wait(serversFuture); state std::vector workers = wait(workersFuture); + /*printf("Found %d storage servers:\n", servers.size()); + for (auto& it : servers) { + printf(" %s\n", it.id().toString().c_str()); + }*/ + std::map workersMap; for (auto worker : workers) { workersMap[worker.interf.address()] = worker.interf; @@ -323,6 +328,7 @@ ACTOR Future getMaxStorageServerQueueSize(Database cx, Reference getStorageServersRecruiting(Database cx, WorkerInterface dist 1.0)); 
TraceEvent("StorageServersRecruiting").detail("Message", recruitingMessage.toString()); - return recruitingMessage.getValue("State") == "Recruiting"; + + if (recruitingMessage.getValue("State") == "Recruiting") { + std::string tssValue; + // if we're tss recruiting, that's fine because that can block indefinitely if only 1 free storage process + if (!recruitingMessage.tryGetValue("IsTSS", tssValue) || tssValue == "False") { + return true; + } + } + return false; } catch (Error& e) { TraceEvent("QuietDatabaseFailure", distributorWorker.id()) .detail("Reason", "Failed to extract StorageServersRecruiting") diff --git a/fdbserver/Ratekeeper.actor.cpp b/fdbserver/Ratekeeper.actor.cpp index 71d3056489..83894c1201 100644 --- a/fdbserver/Ratekeeper.actor.cpp +++ b/fdbserver/Ratekeeper.actor.cpp @@ -719,9 +719,11 @@ ACTOR Future trackEachStorageServer( when(state std::pair> change = waitNext(serverChanges)) { wait(delay(0)); // prevent storageServerTracker from getting cancelled while on the call stack if (change.second.present()) { - auto& a = actors[change.first]; - a = Future(); - a = splitError(trackStorageServerQueueInfo(self, change.second.get()), err); + if (!change.second.get().isTss) { // TODO is this all we need to do to get ratekeeper to ignore tss? 
+ auto& a = actors[change.first]; + a = Future(); + a = splitError(trackStorageServerQueueInfo(self, change.second.get()), err); + } } else actors.erase(change.first); } diff --git a/fdbserver/SimulatedCluster.actor.cpp b/fdbserver/SimulatedCluster.actor.cpp index 128eace3a8..24d7dfb01d 100644 --- a/fdbserver/SimulatedCluster.actor.cpp +++ b/fdbserver/SimulatedCluster.actor.cpp @@ -1138,6 +1138,7 @@ void SimulationConfig::generateNormalConfig(const TestConfig& testConfig) { storage_engine_type = deterministicRandom()->randomInt(0, 4); } } + switch (storage_engine_type) { case 0: { TEST(true); // Simulated cluster using ssd storage engine @@ -1162,6 +1163,17 @@ void SimulationConfig::generateNormalConfig(const TestConfig& testConfig) { default: ASSERT(false); // Programmer forgot to adjust cases. } + + int tssCount = 0; + // if (!testConfig.simpleConfig && deterministicRandom()->random01() < 0.25) { + if (true) { + // if (false) { + // tss + // 1 or 2 tss + tssCount = deterministicRandom()->randomInt(1, 3); + printf("Initial tss count to %d\n", tssCount); + } + // if (deterministicRandom()->random01() < 0.5) { // set_config("ssd"); // } else { @@ -1494,6 +1506,29 @@ void SimulationConfig::generateNormalConfig(const TestConfig& testConfig) { } else { processes_per_machine = deterministicRandom()->randomInt(1, (extraDB ? 14 : 28) / machine_count + 2); } + + // reduce tss to half of extra non-seed servers that can be recruited in usable regions. 
+ tssCount = + std::max(0, std::min(tssCount, (db.usableRegions * (machine_count / datacenters) - replication_type) / 2)); + printf("Adjusted tss count to %d\n", tssCount); + + if (tssCount > 0) { + std::string confStr = format("tss_count:=%d tss_storage_engine:=%d", tssCount, db.storageServerStoreType); + set_config(confStr); + double tssRandom = deterministicRandom()->random01(); + if (tssRandom > 0.5) { + // normal tss mode + g_simulator.tssMode = ISimulator::TSSMode::EnabledNormal; + printf("normal tss mode\n"); + } else if (tssRandom < 0.25) { + // delay injection + g_simulator.tssMode = ISimulator::TSSMode::EnabledAddDelay; + } else { + // fault injection + g_simulator.tssMode = ISimulator::TSSMode::EnabledDropMutations; + } + printf("enabling tss for simulation in mode %d: %s\n", g_simulator.tssMode, confStr.c_str()); + } } // Configures the system according to the given specifications in order to run @@ -1517,6 +1552,9 @@ void setupSimulatedSystem(vector>* systemActors, startingConfigString += " locked"; } for (auto kv : startingConfigJSON) { + if ("tss_storage_engine" == kv.first) { + continue; + } startingConfigString += " "; if (kv.second.type() == json_spirit::int_type) { startingConfigString += kv.first + ":=" + format("%d", kv.second.get_int()); @@ -1531,6 +1569,12 @@ void setupSimulatedSystem(vector>* systemActors, } } + // handle tss_storage_engine separately because the passthrough needs the enum ordinal, but it's serialized to json + // as the string name + if (simconfig.db.desiredTSSCount > 0) { + startingConfigString += format(" tss_storage_engine:=%d", simconfig.db.testingStorageServerStoreType); + } + if (g_simulator.originalRegions != "") { simconfig.set_config(g_simulator.originalRegions); g_simulator.startingDisabledConfiguration = startingConfigString + " " + g_simulator.disableRemote; diff --git a/fdbserver/Status.actor.cpp b/fdbserver/Status.actor.cpp index 5f546638ff..723c6c6111 100644 --- a/fdbserver/Status.actor.cpp +++ 
b/fdbserver/Status.actor.cpp @@ -1880,10 +1880,10 @@ ACTOR static Future>> getCommit ACTOR static Future>> getGrvProxiesAndMetrics( Reference> db, std::unordered_map address_workers) { - vector> results = - wait(getServerMetrics(db->get().client.grvProxies, - address_workers, - std::vector{ "GRVLatencyMetrics", "GRVLatencyBands", "GRVBatchLatencyMetrics" })); + vector> results = wait( + getServerMetrics(db->get().client.grvProxies, + address_workers, + std::vector{ "GRVLatencyMetrics", "GRVLatencyBands", "GRVBatchLatencyMetrics" })); return results; } @@ -3005,6 +3005,14 @@ ACTOR Future clusterGetStatus( statusObj["incompatible_connections"] = incompatibleConnectionsArray; statusObj["datacenter_lag"] = getLagObject(datacenterVersionDifference); + int activeTSSCount = 0; + for (auto& it : storageServers) { + if (it.first.isTss) { + activeTSSCount++; + } + } + statusObj["active_tss_count"] = activeTSSCount; + int totalDegraded = 0; for (auto& it : workers) { if (it.degraded) { diff --git a/fdbserver/TLogServer.actor.cpp b/fdbserver/TLogServer.actor.cpp index 4ea9e83bee..f884a2e310 100644 --- a/fdbserver/TLogServer.actor.cpp +++ b/fdbserver/TLogServer.actor.cpp @@ -1671,6 +1671,11 @@ ACTOR Future tLogPeekMessages(TLogData* self, TLogPeekRequest req, Referen Version poppedVer = poppedVersion(logData, req.tag); if (poppedVer > req.begin) { + printf("tag %s - %s tried to peek popped data!!: %lld > %lld\n", + req.tag.toString().c_str(), + peekId.toString().c_str(), + poppedVer, + req.begin); TLogPeekReply rep; rep.maxKnownVersion = logData->version.get(); rep.minKnownCommittedVersion = logData->minKnownCommittedVersion; diff --git a/fdbserver/WorkerInterface.actor.h b/fdbserver/WorkerInterface.actor.h index 3446b3a7b8..48a4d9ce07 100644 --- a/fdbserver/WorkerInterface.actor.h +++ b/fdbserver/WorkerInterface.actor.h @@ -614,11 +614,18 @@ struct InitializeStorageRequest { UID reqId; UID interfaceId; KeyValueStoreType storeType; + bool isTss; + UID tssPairID; + Version 
tssPairVersion; ReplyPromise reply; template void serialize(Ar& ar) { - serializer(ar, seedTag, reqId, interfaceId, storeType, reply); + if (ar.protocolVersion().hasTSS()) { + serializer(ar, seedTag, reqId, interfaceId, storeType, reply, isTss, tssPairID, tssPairVersion); + } else { + serializer(ar, seedTag, reqId, interfaceId, storeType, reply); + } } }; @@ -770,6 +777,7 @@ struct DiskStoreRequest { struct Role { static const Role WORKER; static const Role STORAGE_SERVER; + static const Role TESTING_STORAGE_SERVER; static const Role TRANSACTION_LOG; static const Role SHARED_TRANSACTION_LOG; static const Role COMMIT_PROXY; @@ -840,6 +848,7 @@ class IDiskQueue; ACTOR Future storageServer(IKeyValueStore* persistentData, StorageServerInterface ssi, Tag seedTag, + Version tssSeedVersion, ReplyPromise recruitReply, Reference> db, std::string folder); diff --git a/fdbserver/masterserver.actor.cpp b/fdbserver/masterserver.actor.cpp index 97953ce1a3..c0dee60682 100644 --- a/fdbserver/masterserver.actor.cpp +++ b/fdbserver/masterserver.actor.cpp @@ -417,11 +417,14 @@ ACTOR Future newTLogServers(Reference self, ACTOR Future newSeedServers(Reference self, RecruitFromConfigurationReply recruits, vector* servers) { + printf("Seeding initial %d storage servers\n", recruits.storageServers.size()); // This is only necessary if the database is at version 0 servers->clear(); if (self->lastEpochEnd) return Void(); + // TODO might need to make this handle TSS recruitment (or make RecruitFromConfiguration handle it?) for simulation + state int idx = 0; state std::map, Tag> dcId_tags; state int8_t nextLocality = 0; @@ -434,6 +437,7 @@ ACTOR Future newSeedServers(Reference self, ? 
dcId_tags[recruits.storageServers[idx].locality.dcId()] : Tag(nextLocality, 0); isr.storeType = self->configuration.storageServerStoreType; + isr.isTss = false; isr.reqId = deterministicRandom()->randomUniqueID(); isr.interfaceId = deterministicRandom()->randomUniqueID(); @@ -469,6 +473,8 @@ ACTOR Future newSeedServers(Reference self, .detail("TargetCount", self->configuration.storageTeamSize) .detail("Servers", describe(*servers)); + printf("Seed servers sees %d desired tss\n", self->configuration.desiredTSSCount); + return Void(); } diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp index 6789549944..7fe0b1c2a3 100644 --- a/fdbserver/storageserver.actor.cpp +++ b/fdbserver/storageserver.actor.cpp @@ -38,6 +38,7 @@ #include "fdbclient/DatabaseContext.h" #include "fdbclient/KeyRangeMap.h" #include "fdbclient/CommitProxyInterface.h" +#include "fdbclient/KeyBackedTypes.h" #include "fdbclient/NativeAPI.actor.h" #include "fdbclient/Notified.h" #include "fdbclient/StatusClient.h" @@ -463,7 +464,7 @@ public: void byteSampleApplyClear(KeyRangeRef range, Version ver); void popVersion(Version v, bool popAllTags = false) { - if (logSystem) { + if (logSystem && !isTss()) { if (v > poppedAllAfter) { popAllTags = true; poppedAllAfter = std::numeric_limits::max(); @@ -510,6 +511,25 @@ public: return mLV.push_back_deep(mLV.arena(), m); } + void setTssPair(UID pairId) { + tssPairID = Optional(pairId); + + // Set up tss fault injection here, only if we are in simulated mode and with fault injection. + // With fault injection enabled, the tss will start acting normal for a bit, then after the specified delay + // start behaving incorrectly. 
+ if (g_network->isSimulated() && !g_simulator.speedUpSimulation && + g_simulator.tssMode >= ISimulator::TSSMode::EnabledAddDelay) { + tssFaultInjectTime = now() + deterministicRandom()->randomInt(60, 300); + TraceEvent(SevWarnAlways, "TSSInjectFaultEnabled", thisServerID) + .detail("Mode", g_simulator.tssMode) + .detail("At", tssFaultInjectTime.get()); + printf("ENABLING FAULT INJECTION FOR TSS %s at time %.4f in mode %d\n", + thisServerID.toString().c_str(), + tssFaultInjectTime.get(), + g_simulator.tssMode); + } + } + StorageServerDisk storage; KeyRangeMap> shards; @@ -552,6 +572,9 @@ public: Reference logCursor; UID thisServerID; + Optional tssPairID; // if this server is a tss, this is the id of its (ss) pair + Optional ssPairID; // if this server is an ss, this is the id of its (tss) pair + Optional tssFaultInjectTime; Key sk; Reference> db; Database cx; @@ -785,6 +808,14 @@ public: mutableData().forgetVersionsBefore(ver); } + bool isTss() const { return tssPairID.present(); } + + bool isSSWithTSSPair() const { return ssPairID.present(); } + + void setSSWithTssPair(UID idOfTSS) { ssPairID = Optional(idOfTSS); } + + void clearSSWithTssPair() { ssPairID = Optional(); } + // This is the maximum version that might be read from storage (the minimum version is durableVersion) Version storageVersion() const { return oldestVersion.get(); } @@ -1046,12 +1077,24 @@ void updateProcessStats(StorageServer* self) { ACTOR Future waitForVersionActor(StorageServer* data, Version version, SpanID spanContext) { state Span span("SS.WaitForVersion"_loc, { spanContext }); + /*if (172218491 == version) { + printf("%sSS %s starting waitForVersionActor @ %lld\n", data->tssPairID.present() ? "T" : "", + data->thisServerID.toString().c_str(), version); + }*/ choose { when(wait(data->version.whenAtLeast(version))) { // FIXME: A bunch of these can block with or without the following delay 0. 
// wait( delay(0) ); // don't do a whole bunch of these at once + /*if (172218491 == version) { + printf("%sSS %s waitForVersionActor @ %lld - at least version\n", data->tssPairID.present() ? "T" : "", + data->thisServerID.toString().c_str(), version); + }*/ if (version < data->oldestVersion.get()) throw transaction_too_old(); // just in case + /*if (172218491 == version) { + printf("%sSS %s waitForVersionActor @ %lld - not too old\n", data->tssPairID.present() ? "T" : "", + data->thisServerID.toString().c_str(), version); + }*/ return version; } when(wait(delay(SERVER_KNOBS->FUTURE_VERSION_DELAY))) { @@ -1060,23 +1103,39 @@ ACTOR Future waitForVersionActor(StorageServer* data, Version version, .detail("Version", version) .detail("MyVersion", data->version.get()) .detail("ServerID", data->thisServerID); + /*if (172218491 == version) { + printf("%sSS %s waitForVersionActor @ %lld - future version\n", data->tssPairID.present() ? "T" : "", + data->thisServerID.toString().c_str(), version); + }*/ throw future_version(); } } } Future waitForVersion(StorageServer* data, Version version, SpanID spanContext) { + /*if (172218491 == version) { + printf("%sSS %s started waitForVersion @ %lld\n", data->tssPairID.present() ? "T" : "", + data->thisServerID.toString().c_str(), version); + }*/ if (version == latestVersion) { version = std::max(Version(1), data->version.get()); } if (version < data->oldestVersion.get() || version <= 0) { + /*if (172218491 == version) { + printf("%sSS %s waitForVersion @ %lld - transaction too old\n", data->tssPairID.present() ? "T" : "", + data->thisServerID.toString().c_str(), version); + }*/ return transaction_too_old(); } else if (version <= data->version.get()) { return version; } if ((data->behind || data->versionBehind) && version > data->version.get()) { + /*if (172218491 == version) { + printf("%sSS %s waitForVersion @ %lld - process_behind\n", data->tssPairID.present() ? 
"T" : "", + data->thisServerID.toString().c_str(), version); + }*/ return process_behind(); } @@ -1110,6 +1169,11 @@ ACTOR Future getValueQ(StorageServer* data, GetValueRequest req) { Span span("SS:getValue"_loc, { req.spanContext }); span.addTag("key"_sr, req.key); + /*if ("000002ff000004c10000000100000002" == req.key.toString() && 172218491 == req.version) { + printf("%sSS %s started getValueQ for %s @ %lld\n", data->tssPairID.present() ? "T" : "", + data->thisServerID.toString().c_str(), req.key.toString().c_str(), req.version); + }*/ + try { ++data->counters.getValueQueries; ++data->counters.allQueries; @@ -1121,6 +1185,11 @@ ACTOR Future getValueQ(StorageServer* data, GetValueRequest req) { // so we need to downgrade here wait(data->getQueryDelay()); + /*if ("000002ff000004c10000000100000002" == req.key.toString() && 172218491 == req.version) { + printf("%sSS %s getValueQ for %s @ %lld - got query delay\n", data->tssPairID.present() ? "T" : "", + data->thisServerID.toString().c_str(), req.key.toString().c_str(), req.version); + }*/ + if (req.debugID.present()) g_traceBatch.addEvent("GetValueDebug", req.debugID.get().first(), @@ -1135,8 +1204,17 @@ ACTOR Future getValueQ(StorageServer* data, GetValueRequest req) { state uint64_t changeCounter = data->shardChangeCounter; + /*if ("000002ff000004c10000000100000002" == req.key.toString() && 172218491 == req.version) { + printf("%sSS %s getValueQ for %s @ %lld - waited for version\n", data->tssPairID.present() ? "T" : "", + data->thisServerID.toString().c_str(), req.key.toString().c_str(), req.version); + }*/ + if (!data->shards[req.key]->isReadable()) { //TraceEvent("WrongShardServer", data->thisServerID).detail("Key", req.key).detail("Version", version).detail("In", "getValueQ"); + /*if ("000002ff000004c10000000100000002" == req.key.toString() && 172218491 == req.version) { + printf("%sSS %s started getValueQ for %s @ %lld got wrong shard server\n", data->tssPairID.present() ? 
+ "T" : "", data->thisServerID.toString().c_str(), req.key.toString().c_str(), req.version); + }*/ throw wrong_shard_server(); } @@ -1145,6 +1223,10 @@ ACTOR Future getValueQ(StorageServer* data, GetValueRequest req) { if (i && i->isValue() && i.key() == req.key) { v = (Value)i->getValue(); path = 1; + /*if ("000002ff000004c10000000100000002" == req.key.toString() && 172218491 == req.version) { + printf("%sSS %s getValueQ for %s @ %lld - got from memory\n", data->tssPairID.present() ? "T" : "", + data->thisServerID.toString().c_str(), req.key.toString().c_str(), req.version); + }*/ } else if (!i || !i->isClearTo() || i->getEndKey() <= req.key) { path = 2; Optional vv = wait(data->storage.readValue(req.key, req.debugID)); @@ -1155,18 +1237,21 @@ ACTOR Future getValueQ(StorageServer* data, GetValueRequest req) { } data->checkChangeCounter(changeCounter, req.key); v = vv; + /*if ("000002ff000004c10000000100000002" == req.key.toString() && 172218491 == req.version) { + printf("%sSS %s getValueQ for %s @ %lld - got from storage\n", data->tssPairID.present() ? "T" : "", + data->thisServerID.toString().c_str(), req.key.toString().c_str(), req.version); + }*/ } DEBUG_MUTATION("ShardGetValue", version, MutationRef(MutationRef::DebugKey, req.key, v.present() ? v.get() : LiteralStringRef(""))); - DEBUG_MUTATION("ShardGetPath", - version, - MutationRef(MutationRef::DebugKey, - req.key, - path == 0 ? LiteralStringRef("0") - : path == 1 ? LiteralStringRef("1") - : LiteralStringRef("2"))); + DEBUG_MUTATION( + "ShardGetPath", + version, + MutationRef(MutationRef::DebugKey, + req.key, + path == 0 ? LiteralStringRef("0") : path == 1 ? 
LiteralStringRef("1") : LiteralStringRef("2"))); /* StorageMetrics m; @@ -1183,6 +1268,12 @@ ACTOR Future getValueQ(StorageServer* data, GetValueRequest req) { ++data->counters.emptyQueries; } + /*if ("000002ff000004c10000000100000002" == req.key.toString() && 172218491 == req.version) { + printf("%sSS %s getValueQ for %s @ %lld = %s\n", data->tssPairID.present() ? "T" : "", + data->thisServerID.toString().c_str(), req.key.toString().c_str(), req.version, v.present() ? + v.get().toString().c_str() : ""); + }*/ + if (SERVER_KNOBS->READ_SAMPLING_ENABLED) { // If the read yields no value, randomly sample the empty read. int64_t bytesReadPerKSecond = @@ -1205,8 +1296,16 @@ ACTOR Future getValueQ(StorageServer* data, GetValueRequest req) { reply.penalty = data->getPenalty(); req.reply.send(reply); } catch (Error& e) { + /*if ("000002ff000004c10000000100000002" == req.key.toString() && 172218491 == req.version) { + printf("%sSS %s getValueQ for %s @ %lld = ERROR: %d\n", data->tssPairID.present() ? "T" : "", + data->thisServerID.toString().c_str(), req.key.toString().c_str(), req.version, e.code()); + }*/ if (!canReplyWith(e)) throw; + /*if ("000002ff000004c10000000100000002" == req.key.toString() && 172218491 == req.version) { + printf("%sSS %s getValueQ for %s @ %lld = replying with error: %d\n", data->tssPairID.present() ? "T" : "", + data->thisServerID.toString().c_str(), req.key.toString().c_str(), req.version, e.code()); + }*/ data->sendErrorWithPenalty(req.reply, e, data->getPenalty()); } @@ -1717,13 +1816,21 @@ ACTOR Future findKey(StorageServer* data, state int distance = forward ? sel.offset : 1 - sel.offset; state Span span("SS.findKey"_loc, { parentSpan }); + /*if (version == 166817893 && sel.offset == 80) { + printf("%sSS %s FindKey request %s:<%s:%d @ %lld: with key range [%s - %s):\n", data->isTss() ? "t" : "", + data->thisServerID.toString().c_str(), sel.getKey().printable().c_str(), sel.orEqual ? 
"=" : "", sel.offset, + version, range.begin.toString().c_str(), range.end.toString().c_str()); + }*/ + // Don't limit the number of bytes if this is a trivial key selector (there will be at most two items returned from // the read range in this case) state int maxBytes; if (sel.offset <= 1 && sel.offset >= 0) maxBytes = std::numeric_limits::max(); else - maxBytes = BUGGIFY ? SERVER_KNOBS->BUGGIFY_LIMIT_BYTES : SERVER_KNOBS->STORAGE_LIMIT_BYTES; + maxBytes = (g_network->isSimulated() && g_simulator.tssMode == ISimulator::TSSMode::Disabled && BUGGIFY) + ? SERVER_KNOBS->BUGGIFY_LIMIT_BYTES + : SERVER_KNOBS->STORAGE_LIMIT_BYTES; state GetKeyValuesReply rep = wait( readRange(data, @@ -1734,6 +1841,13 @@ ACTOR Future findKey(StorageServer* data, span.context)); state bool more = rep.more && rep.data.size() != distance + skipEqualKey; + /*if (version == 166817893 && sel.offset == 80) { + printf("%sSS %s FindKey request %s:<%s:%d @ %lld: readRange with limBytes=%d got %d:\n", data->isTss() ? "t" : + "", data->thisServerID.toString().c_str(), sel.getKey().printable().c_str(), sel.orEqual ? "=" : "", sel.offset, + version, maxBytes, rep.data.size()); for (auto& it : rep.data) { printf(" %s\n", it.key.toString().c_str()); + } + }*/ + // If we get only one result in the reverse direction as a result of the data being too large, we could get stuck in // a loop if (more && !forward && rep.data.size() == 1) { @@ -1781,9 +1895,20 @@ ACTOR Future findKey(StorageServer* data, // query SOMEDAY: graceful handling of exceptionally sized values ASSERT(returnKey != sel.getKey()); + /*if (version == 166817893 && sel.offset == 80) { + printf("%sSS %s FindKey request %s:<%s:%d @ %lld: moving same shard\n", data->isTss() ? "t" : "", + data->thisServerID.toString().c_str(), sel.getKey().printable().c_str(), sel.orEqual ? 
"=" : "", sel.offset, + version); + }*/ return returnKey; - } else + } else { + /*if (version == 166817893 && sel.offset == 80) { + printf("%sSS %s FindKey request %s:<%s:%d @ %lld: moving shard boundary\n", data->isTss() ? "t" : "", + data->thisServerID.toString().c_str(), sel.getKey().printable().c_str(), sel.orEqual ? "=" : "", sel.offset, + version); + }*/ return forward ? range.end : range.begin; + } } } @@ -1806,6 +1931,15 @@ ACTOR Future getKeyValuesQ(StorageServer* data, GetKeyValuesRequest req) state Span span("SS:getKeyValues"_loc, { req.spanContext }); state int64_t resultSize = 0; + if (req.begin.getKey().toString() == "B" && req.end.getKey().toString() == "v" && req.version == 107157353) { + printf("%sSS %s starting query [%s - %s) @ %lld\n", + data->isTss() ? "T" : "", + data->thisServerID.toString().c_str(), + req.begin.getKey().printable().c_str(), + req.end.getKey().printable().c_str(), + req.version); + } + ++data->counters.getRangeQueries; ++data->counters.allQueries; ++data->readQueueSizeMetric; @@ -1820,6 +1954,15 @@ ACTOR Future getKeyValuesQ(StorageServer* data, GetKeyValuesRequest req) wait(data->getQueryDelay()); } + if (req.begin.getKey().toString() == "B" && req.end.getKey().toString() == "v" && req.version == 107157353) { + printf("%sSS %s downgraded [%s - %s) @ %lld\n", + data->isTss() ? "T" : "", + data->thisServerID.toString().c_str(), + req.begin.getKey().printable().c_str(), + req.end.getKey().printable().c_str(), + req.version); + } + try { if (req.debugID.present()) g_traceBatch.addEvent("TransactionDebug", req.debugID.get().first(), "storageserver.getKeyValues.Before"); @@ -1844,6 +1987,15 @@ ACTOR Future getKeyValuesQ(StorageServer* data, GetKeyValuesRequest req) throw wrong_shard_server(); } + if (req.begin.getKey().toString() == "B" && req.end.getKey().toString() == "v" && req.version == 107157353) { + printf("%sSS %s validated shard [%s - %s) @ %lld\n", + data->isTss() ? 
"T" : "", + data->thisServerID.toString().c_str(), + req.begin.getKey().printable().c_str(), + req.end.getKey().printable().c_str(), + req.version); + } + state int offset1; state int offset2; state Future fBegin = req.begin.isFirstGreaterOrEqual() @@ -1854,6 +2006,7 @@ ACTOR Future getKeyValuesQ(StorageServer* data, GetKeyValuesRequest req) : findKey(data, req.end, version, shard, &offset2, span.context); state Key begin = wait(fBegin); state Key end = wait(fEnd); + if (req.debugID.present()) g_traceBatch.addEvent( "TransactionDebug", req.debugID.get().first(), "storageserver.getKeyValues.AfterKeys"); @@ -1873,6 +2026,25 @@ ACTOR Future getKeyValuesQ(StorageServer* data, GetKeyValuesRequest req) throw wrong_shard_server(); } + if (req.begin.getKey().toString() == "B" && req.end.getKey().toString() == "v" && req.version == 107157353) { + printf("%sSS %s resolved begin and end [%s - %s) @ %lld\n", + data->isTss() ? "T" : "", + data->thisServerID.toString().c_str(), + req.begin.getKey().printable().c_str(), + req.end.getKey().printable().c_str(), + req.version); + printf(" %s:<%s:%d @ -> %s\n", + req.begin.getKey().printable().c_str(), + req.begin.orEqual ? "=" : "", + req.begin.offset, + req.begin.getKey().printable().c_str()); + printf(" %s:<%s:%d @ -> %s\n", + req.end.getKey().printable().c_str(), + req.end.orEqual ? "=" : "", + req.end.offset, + req.end.getKey().printable().c_str()); + } + if (begin >= end) { if (req.debugID.present()) g_traceBatch.addEvent("TransactionDebug", req.debugID.get().first(), "storageserver.getKeyValues.Send"); @@ -1890,10 +2062,28 @@ ACTOR Future getKeyValuesQ(StorageServer* data, GetKeyValuesRequest req) } else { state int remainingLimitBytes = req.limitBytes; + /*if (req.begin.getKey().toString() == "m3fc7" && req.end.getKey().toString() == "s" && req.version == + 133421369) { printf("%sSS %s beginning readRange [%s - %s) @ %lld\n", data->isTss() ? 
"T" : "", + data->thisServerID.toString().c_str(), req.begin.getKey().printable().c_str(), + req.end.getKey().printable().c_str(), req.version); + }*/ + GetKeyValuesReply _r = wait(readRange(data, version, KeyRangeRef(begin, end), req.limit, &remainingLimitBytes, span.context)); GetKeyValuesReply r = _r; + if (req.begin.getKey().toString() == "B" && req.end.getKey().toString() == "v" && + req.version == 107157353) { + printf("%sSS %s completed readRange (%d)%s: \n", + data->isTss() ? "T" : "", + data->thisServerID.toString().c_str(), + r.data.size(), + r.more ? "+" : ""); + /*for (auto& it : r.data) { + printf(" %s=%s\n", it.key.printable().c_str(), it.value.printable().c_str()); + }*/ + } + if (req.debugID.present()) g_traceBatch.addEvent( "TransactionDebug", req.debugID.get().first(), "storageserver.getKeyValues.AfterReadRange"); @@ -1926,6 +2116,14 @@ ACTOR Future getKeyValuesQ(StorageServer* data, GetKeyValuesRequest req) data->metrics.notifyBytesReadPerKSecond(r.data[r.data.size() - 1].key, bytesReadPerKSecond); } + if (req.begin.getKey().toString() == "B" && req.end.getKey().toString() == "v" && + req.version == 107157353) { + printf("%sSS %s replying to %s\n", + data->isTss() ? "T" : "", + data->thisServerID.toString().c_str(), + req.reply.getEndpoint().token.toString().c_str()); + } + r.penalty = data->getPenalty(); req.reply.send(r); @@ -1976,14 +2174,33 @@ ACTOR Future getKeyQ(StorageServer* data, GetKeyRequest req) { // so we need to downgrade here wait(data->getQueryDelay()); + /*if (req.version == 166817893 && req.sel.offset == 80) { + printf("%sSS %s GetKey request %s:<%s:%d @ %lld\n", data->isTss() ? "t" : "", + data->thisServerID.toString().c_str(), req.sel.getKey().printable().c_str(), req.sel.orEqual ? 
"=" : "", + req.sel.offset, req.version); + }*/ + try { state Version version = wait(waitForVersion(data, req.version, req.spanContext)); + + /*if (req.version == 166817893 && req.sel.offset == 80) { + printf("%sSS %s GetKey request %s:<%s:%d @ %lld: waited for version\n", data->isTss() ? "t" : "", + data->thisServerID.toString().c_str(), req.sel.getKey().printable().c_str(), req.sel.orEqual ? "=" : "", + req.sel.offset, req.version); + }*/ + state uint64_t changeCounter = data->shardChangeCounter; state KeyRange shard = getShardKeyRange(data, req.sel); state int offset; Key k = wait(findKey(data, req.sel, version, shard, &offset, req.spanContext)); + /*if (req.version == 166817893 && req.sel.offset == 80) { + printf("%sSS %s GetKey request %s:<%s:%d @ %lld: found key: %s\n", data->isTss() ? "t" : "", + data->thisServerID.toString().c_str(), req.sel.getKey().printable().c_str(), req.sel.orEqual ? "=" : "", + req.sel.offset, req.version, k.toString().c_str()); + }*/ + data->checkChangeCounter( changeCounter, KeyRangeRef(std::min(req.sel.getKey(), k), std::max(req.sel.getKey(), k))); @@ -1998,6 +2215,12 @@ ACTOR Future getKeyQ(StorageServer* data, GetKeyRequest req) { else updated = KeySelectorRef(k, true, 0); // found + /*if (req.version == 166817893 && req.sel.offset == 80) { + printf("%sSS %s GetKey request %s:<%s:%d @ %lld: updated: %s:<%s:%d\n", data->isTss() ? "t" : "", + data->thisServerID.toString().c_str(), req.sel.getKey().printable().c_str(), req.sel.orEqual ? "=" : "", + req.sel.offset, req.version, updated.getKey().printable().c_str(), updated.orEqual ? 
"=" : "", updated.offset); + }*/ + resultSize = k.size(); data->counters.bytesQueried += resultSize; ++data->counters.rowsQueried; @@ -2322,6 +2545,14 @@ void removeDataRange(StorageServer* ss, // disk when this latest version becomes durable mLV is also modified if necessary to ensure that split clears can // be forgotten + // TODO REMOVE print + printf("%sss %s removing data range [%s - %s) @ %lld\n", + ss->isTss() ? "t" : "", + ss->thisServerID.toString().c_str(), + range.begin.toString().c_str(), + range.end.toString().c_str(), + mLV.version); + MutationRef clearRange(MutationRef::ClearRange, range.begin, range.end); clearRange = ss->addMutationToMutationLog(mLV, clearRange); @@ -2352,6 +2583,13 @@ void removeDataRange(StorageServer* ss, } data.erase(range.begin, range.end); + + printf("%sss %s removed data range [%s - %s) @ %lld\n", + ss->isTss() ? "t" : "", + ss->thisServerID.toString().c_str(), + range.begin.toString().c_str(), + range.end.toString().c_str(), + mLV.version); } void setAvailableStatus(StorageServer* self, KeyRangeRef keys, bool available); @@ -2932,32 +3170,30 @@ void changeServerKeys(StorageServer* data, ChangeServerKeysContext context) { ASSERT(!keys.empty()); - //TraceEvent("ChangeServerKeys", data->thisServerID) - // .detail("KeyBegin", keys.begin) - // .detail("KeyEnd", keys.end) - // .detail("NowAssigned", nowAssigned) - // .detail("Version", version) - // .detail("Context", changeServerKeysContextName[(int)context]); + TraceEvent("ChangeServerKeys", data->thisServerID) + .detail("KeyBegin", keys.begin) + .detail("KeyEnd", keys.end) + .detail("NowAssigned", nowAssigned) + .detail("Version", version) + .detail("Context", changeServerKeysContextName[(int)context]); validate(data); // TODO(alexmiller): Figure out how to selectively enable spammy data distribution events. - // DEBUG_KEY_RANGE( nowAssigned ? "KeysAssigned" : "KeysUnassigned", version, keys ); + DEBUG_KEY_RANGE(nowAssigned ? 
"KeysAssigned" : "KeysUnassigned", version, keys); bool isDifferent = false; auto existingShards = data->shards.intersectingRanges(keys); for (auto it = existingShards.begin(); it != existingShards.end(); ++it) { if (nowAssigned != it->value()->assigned()) { isDifferent = true; - /*TraceEvent("CSKRangeDifferent", data->thisServerID) - .detail("KeyBegin", it->range().begin) - .detail("KeyEnd", it->range().end);*/ + TraceEvent("CSKRangeDifferent", data->thisServerID) + .detail("KeyBegin", it->range().begin) + .detail("KeyEnd", it->range().end); break; } } if (!isDifferent) { - //TraceEvent("CSKShortCircuit", data->thisServerID) - // .detail("KeyBegin", keys.begin) - // .detail("KeyEnd", keys.end); + TraceEvent("CSKShortCircuit", data->thisServerID).detail("KeyBegin", keys.begin).detail("KeyEnd", keys.end); return; } @@ -2995,13 +3231,13 @@ void changeServerKeys(StorageServer* data, for (auto r = vr.begin(); r != vr.end(); ++r) { KeyRangeRef range = keys & r->range(); bool dataAvailable = r->value() == latestVersion || r->value() >= version; - /*TraceEvent("CSKRange", data->thisServerID) + TraceEvent("CSKRange", data->thisServerID) .detail("KeyBegin", range.begin) .detail("KeyEnd", range.end) .detail("Available", dataAvailable) .detail("NowAssigned", nowAssigned) .detail("NewestAvailable", r->value()) - .detail("ShardState0", data->shards[range.begin]->debugDescribeState());*/ + .detail("ShardState0", data->shards[range.begin]->debugDescribeState()); if (!nowAssigned) { if (dataAvailable) { ASSERT(r->value() == @@ -3043,8 +3279,14 @@ void changeServerKeys(StorageServer* data, oldShards.clear(); ranges.clear(); for (auto r = removeRanges.begin(); r != removeRanges.end(); ++r) { + // TODO should we do this at the passed in version? (or the passed in version + 1?) 
removeDataRange(data, data->addVersionToMutationLog(data->data().getLatestVersion()), data->shards, *r); setAvailableStatus(data, *r, false); + printf("%sss %s set data range unavailable [%s - %s)\n", + data->isTss() ? "t" : "", + data->thisServerID.toString().c_str(), + keys.begin.toString().c_str(), + keys.end.toString().c_str()); } validate(data); } @@ -3103,6 +3345,7 @@ static const KeyValueRef persistFormat(LiteralStringRef(PERSIST_PREFIX "Format") static const KeyRangeRef persistFormatReadableRange(LiteralStringRef("FoundationDB/StorageServer/1/2"), LiteralStringRef("FoundationDB/StorageServer/1/5")); static const KeyRef persistID = LiteralStringRef(PERSIST_PREFIX "ID"); +static const KeyRef persistTssPairID = LiteralStringRef(PERSIST_PREFIX "tssPairID"); // (Potentially) change with the durable version or when fetchKeys completes static const KeyRef persistVersion = LiteralStringRef(PERSIST_PREFIX "Version"); @@ -3215,15 +3458,26 @@ private: data->recoveryVersionSkips.emplace_back(rollbackVersion, currentVersion - rollbackVersion); } else if (m.type == MutationRef::SetValue && m.param1 == killStoragePrivateKey) { + printf("worked removed kill storage: %s\n", data->thisServerID.toString().c_str()); throw worker_removed(); } else if ((m.type == MutationRef::SetValue || m.type == MutationRef::ClearRange) && m.param1.substr(1).startsWith(serverTagPrefix)) { - bool matchesThisServer = decodeServerTagKey(m.param1.substr(1)) == data->thisServerID; - if ((m.type == MutationRef::SetValue && !matchesThisServer) || - (m.type == MutationRef::ClearRange && matchesThisServer)) + UID serverTagKey = decodeServerTagKey(m.param1.substr(1)); + // bool matchesThisServer = (!data->isTss() && serverTagKey == data->thisServerID) || (data->isTss() && + // serverTagKey == data->tssPairID.get()); + bool matchesThisServer = serverTagKey == data->thisServerID; + bool matchesTssPair = data->isTss() ? 
serverTagKey == data->tssPairID.get() : false; + if ((m.type == MutationRef::SetValue && !data->isTss() && !matchesThisServer) || + (m.type == MutationRef::ClearRange && (matchesThisServer || (data->isTss() && matchesTssPair)))) { + printf("%sSS %s removed b/c tag mutation: %s\n", + data->isTss() ? "T" : "", + data->thisServerID.toString().c_str(), + m.toString().c_str()); throw worker_removed(); + } } else if (m.type == MutationRef::SetValue && m.param1 == rebootWhenDurablePrivateKey) { data->rebootAfterDurableVersion = currentVersion; + printf("%s got reboot after durable @ %lld\n", data->thisServerID.toString().c_str(), currentVersion); TraceEvent("RebootWhenDurableSet", data->thisServerID) .detail("DurableVersion", data->durableVersion.get()) .detail("RebootAfterDurableVersion", data->rebootAfterDurableVersion); @@ -3288,6 +3542,24 @@ ACTOR Future update(StorageServer* data, bool* pReceivedUpdate) { wait(delayJittered(.005, TaskPriority::TLogPeekReply)); } + // TODO REMOVE!! just for testing what happens when TSS gets behind + if (g_network->isSimulated() && data->isTss() && g_simulator.tssMode == ISimulator::TSSMode::EnabledAddDelay && + data->tssFaultInjectTime.present() && data->tssFaultInjectTime.get() < now()) { + if (deterministicRandom()->random01() < 0.01) { + TraceEvent(SevWarnAlways, "TSSInjectDelayForever", data->thisServerID); + printf("TSS %s INJECTING DELAY FOREVER!!\n", data->thisServerID.toString().c_str()); + // small random chance to just completely get stuck here, each tss should eventually hit this in this + // mode + wait(Never()); + } else { + // otherwise pause for part of a second + double delayTime = deterministicRandom()->random01(); + TraceEvent(SevWarnAlways, "TSSInjectDelay", data->thisServerID).detail("Delay", delayTime); + printf("TSS %s INJECTING DELAY for %.4f!!\n", data->thisServerID.toString().c_str(), delayTime); + wait(delay(delayTime)); + } + } + while (data->byteSampleClearsTooLarge.get()) { 
wait(data->byteSampleClearsTooLarge.onChange()); } @@ -3300,8 +3572,11 @@ ACTOR Future update(StorageServer* data, bool* pReceivedUpdate) { break; } } - if (cursor->popped() > 0) + if (cursor->popped() > 0) { + printf( + "Worker removed because of popped=%d: %s\n", cursor->popped(), data->thisServerID.toString().c_str()); throw worker_removed(); + } ++data->counters.updateBatches; data->lastTLogVersion = cursor->getMaxKnownVersion(); @@ -3352,7 +3627,7 @@ ACTOR Future update(StorageServer* data, bool* pReceivedUpdate) { } else { MutationRef msg; cloneReader >> msg; - //TraceEvent(SevDebug, "SSReadingLog", data->thisServerID).detail("Mutation", msg.toString()); + // TraceEvent(SevDebug, "SSReadingLog", data->thisServerID).detail("Mutation", msg.toString()); if (firstMutation && msg.param1.startsWith(systemKeys.end)) hasPrivateData = true; @@ -3460,7 +3735,15 @@ ACTOR Future update(StorageServer* data, bool* pReceivedUpdate) { Span span("SS:update"_loc, { spanContext }); span.addTag("key"_sr, msg.param1); - if (ver != invalidVersion) { // This change belongs to a version < minVersion + if (g_network->isSimulated() && data->isTss() && + g_simulator.tssMode == ISimulator::TSSMode::EnabledDropMutations && + data->tssFaultInjectTime.present() && data->tssFaultInjectTime.get() < now() && + (msg.type == MutationRef::SetValue || msg.type == MutationRef::ClearRange) && msg.param1.size() && + msg.param1[0] != 0xff && deterministicRandom()->random01() < 0.05) { + TraceEvent(SevWarnAlways, "TSSInjectDropMutation", data->thisServerID) + .detail("Mutation", msg.toString()) + .detail("Version", cloneCursor2->version().toString()); + } else if (ver != invalidVersion) { // This change belongs to a version < minVersion DEBUG_MUTATION("SSPeek", ver, msg).detail("ServerID", data->thisServerID); if (ver == 1) { TraceEvent("SSPeekMutation", data->thisServerID); @@ -3699,8 +3982,14 @@ ACTOR Future updateStorage(StorageServer* data) { #endif void 
StorageServerDisk::makeNewStorageServerDurable() { + // TODO REMOVE print + printf( + "%sSS %s saving durable state\n", data->tssPairID.present() ? "T" : "", data->thisServerID.toString().c_str()); storage->set(persistFormat); storage->set(KeyValueRef(persistID, BinaryWriter::toValue(data->thisServerID, Unversioned()))); + if (data->tssPairID.present()) { + storage->set(KeyValueRef(persistTssPairID, BinaryWriter::toValue(data->tssPairID.get(), Unversioned()))); + } storage->set(KeyValueRef(persistVersion, BinaryWriter::toValue(data->version.get(), Unversioned()))); storage->set(KeyValueRef(persistShardAssignedKeys.begin.toString(), LiteralStringRef("0"))); storage->set(KeyValueRef(persistShardAvailableKeys.begin.toString(), LiteralStringRef("0"))); @@ -3929,6 +4218,7 @@ ACTOR Future restoreByteSample(StorageServer* data, ACTOR Future restoreDurableState(StorageServer* data, IKeyValueStore* storage) { state Future> fFormat = storage->readValue(persistFormat.key); state Future> fID = storage->readValue(persistID); + state Future> ftssPairID = storage->readValue(persistTssPairID); state Future> fVersion = storage->readValue(persistVersion); state Future> fLogProtocol = storage->readValue(persistLogProtocol); state Future> fPrimaryLocality = storage->readValue(persistPrimaryLocality); @@ -3941,7 +4231,7 @@ ACTOR Future restoreDurableState(StorageServer* data, IKeyValueStore* stor restoreByteSample(data, storage, byteSampleSampleRecovered, startByteSampleRestore.getFuture()); TraceEvent("ReadingDurableState", data->thisServerID); - wait(waitForAll(std::vector{ fFormat, fID, fVersion, fLogProtocol, fPrimaryLocality })); + wait(waitForAll(std::vector{ fFormat, fID, ftssPairID, fVersion, fLogProtocol, fPrimaryLocality })); wait(waitForAll(std::vector{ fShardAssigned, fShardAvailable })); wait(byteSampleSampleRecovered.getFuture()); TraceEvent("RestoringDurableState", data->thisServerID); @@ -3961,7 +4251,12 @@ ACTOR Future restoreDurableState(StorageServer* data, 
IKeyValueStore* stor throw worker_recovery_failed(); } data->thisServerID = BinaryReader::fromStringRef(fID.get().get(), Unversioned()); - data->sk = serverKeysPrefixFor(data->thisServerID).withPrefix(systemKeys.begin); // FFFF/serverKeys/[this server]/ + if (ftssPairID.get().present()) { + data->setTssPair(BinaryReader::fromStringRef(ftssPairID.get().get(), Unversioned())); + } + + data->sk = serverKeysPrefixFor((data->tssPairID.present()) ? data->tssPairID.get() : data->thisServerID) + .withPrefix(systemKeys.begin); // FFFF/serverKeys/[this server]/ if (fLogProtocol.get().present()) data->logProtocol = BinaryReader::fromStringRef(fLogProtocol.get().get(), Unversioned()); @@ -3973,6 +4268,17 @@ ACTOR Future restoreDurableState(StorageServer* data, IKeyValueStore* stor debug_checkRestoredVersion(data->thisServerID, version, "StorageServer"); data->setInitialVersion(version); + // TODO REMOVE print + printf("%sSS %s restored durable state @ %lld\n", + data->tssPairID.present() ? "T" : "", + data->thisServerID.toString().c_str(), + version); + if (data->tssPairID.present()) { + printf("TSS %s recovered pairing to SS %s\n", + data->thisServerID.toString().c_str(), + data->tssPairID.get().toString().c_str()); + } + state RangeResult available = fShardAvailable.get(); state int availableLoc; for (availableLoc = 0; availableLoc < available.size(); availableLoc++) { @@ -4006,6 +4312,7 @@ ACTOR Future restoreDurableState(StorageServer* data, IKeyValueStore* stor wait(yield()); } + // TODO why is this seemingly random delay here? 
wait(delay(0.0001)); { @@ -4253,20 +4560,30 @@ ACTOR Future metricsCore(StorageServer* self, StorageServerInterface ssi) wait(self->byteSampleRecovery); - Tag tag = self->tag; self->actors.add(traceCounters("StorageMetrics", self->thisServerID, SERVER_KNOBS->STORAGE_LOGGING_DELAY, &self->counters.cc, self->thisServerID.toString() + "/StorageMetrics", - [tag, self=self](TraceEvent& te) { - te.detail("Tag", tag.toString()); - StorageBytes sb = self->storage.getStorageBytes(); + [self=self](TraceEvent& te) { + te.detail("Tag", self->tag.toString()); + StorageBytes sb = self->storage.getStorageBytes(); te.detail("KvstoreBytesUsed", sb.used); te.detail("KvstoreBytesFree", sb.free); te.detail("KvstoreBytesAvailable", sb.available); te.detail("KvstoreBytesTotal", sb.total); te.detail("KvstoreBytesTemp", sb.temp); + if (self->isTss()) { + te.detail("TSSPairID", self->tssPairID); + te.detail("TSSJointID", + UID(self->thisServerID.first() ^ self->tssPairID.get().first(), + self->thisServerID.second() ^ self->tssPairID.get().second())); + } else if (self->isSSWithTSSPair()) { + te.detail("SSPairID", self->ssPairID); + te.detail("TSSJointID", + UID(self->thisServerID.first() ^ self->ssPairID.get().first(), + self->thisServerID.second() ^ self->ssPairID.get().second())); + } })); loop { @@ -4370,6 +4687,20 @@ ACTOR Future serveGetValueRequests(StorageServer* self, FutureStream serveGetKeyValuesRequests(StorageServer* self, FutureStream getKeyValues) { loop { GetKeyValuesRequest req = waitNext(getKeyValues); + + if (req.begin.getKey().toString() == "m3fc7" && req.end.getKey().toString() == "s" && + req.version == 133421369) { + printf("%sSS %s got range read [%s - %s) @ %lld\n", + self->isTss() ? 
"T" : "", + self->thisServerID.toString().c_str(), + req.begin.getKey().printable().c_str(), + req.end.getKey().printable().c_str(), + req.version); + } + + // A TSS should never be the source for fetch keys + ASSERT(!self->tssPairID.present() || !req.isFetchKeys); + // Warning: This code is executed at extremely high priority (TaskPriority::LoadBalancedEndpoint), so downgrade // before doing real work self->actors.add(self->readGuard(req, getKeyValuesQ)); @@ -4601,6 +4932,28 @@ ACTOR Future storageServerCore(StorageServer* self, StorageServerInterface } } } + // SS monitors tss mapping here to see if it has a tss pair. + // This information is only used for ss/tss pair metrics reporting so it's ok to be eventually + // consistent. + if (!self->isTss()) { + ClientDBInfo clientInfo = self->db->get().client; + Optional myTssPair = clientInfo.getTssPair(self->thisServerID); + if (myTssPair.present()) { + // TODO REMOVE print, just for debugging + if (!self->ssPairID.present()) { + printf("SS %s found tss pair %s\n", + self->thisServerID.toString().c_str(), + myTssPair.get().id().toString().c_str()); + } + self->setSSWithTssPair(myTssPair.get().id()); + } else { + // TODO REMOVE print, just for debugging + if (self->ssPairID.present()) { + printf("SS %s lost tss pair\n", self->thisServerID.toString().c_str()); + } + self->clearSSWithTssPair(); + } + } } when(GetShardStateRequest req = waitNext(ssi.getShardState.getFuture())) { if (req.mode == GetShardStateRequest::NO_WAIT) { @@ -4667,18 +5020,19 @@ ACTOR Future memoryStoreRecover(IKeyValueStore* store, Reference tr = makeReference(cx); state int noCanRemoveCount = 0; loop { try { - tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); + tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - state bool canRemove = wait(canRemoveStorageServer(&tr, id)); + state bool canRemove = wait(canRemoveStorageServer(tr, id)); if (!canRemove) { 
TEST(true); // it's possible that the caller had a transaction in flight that assigned keys to the // server. Wait for it to reverse its mistake. wait(delayJittered(SERVER_KNOBS->REMOVE_RETRY_DELAY, TaskPriority::UpdateStorage)); - tr.reset(); + tr->reset(); TraceEvent("RemoveStorageServerRetrying") .detail("Count", noCanRemoveCount++) .detail("ServerID", id) @@ -4688,21 +5042,34 @@ ACTOR Future memoryStoreRecover(IKeyValueStore* store, ReferenceonError(e)); TraceEvent("RemoveStorageServerRetrying").error(err); } } } +// for creating a new storage server ACTOR Future storageServer(IKeyValueStore* persistentData, StorageServerInterface ssi, Tag seedTag, + Version tssSeedVersion, ReplyPromise recruitReply, Reference> db, std::string folder) { state StorageServer self(persistentData, db, ssi); + if (ssi.isTss) { + self.setTssPair(ssi.tssPairID); + ASSERT(self.isTss()); + } - self.sk = serverKeysPrefixFor(self.thisServerID).withPrefix(systemKeys.begin); // FFFF/serverKeys/[this server]/ + // TODO REMOVE + printf("initializing %sstorage %s with tag %s and tss pair=%s\n", + ssi.isTss ? "testing " : "", + ssi.id().toString().c_str(), + seedTag.toString().c_str(), + self.tssPairID.present() ? self.tssPairID.get().toString().c_str() : ""); + self.sk = serverKeysPrefixFor(self.tssPairID.present() ? self.tssPairID.get() : self.thisServerID) + .withPrefix(systemKeys.begin); // FFFF/serverKeys/[this server]/ self.folder = folder; try { @@ -4713,7 +5080,16 @@ ACTOR Future storageServer(IKeyValueStore* persistentData, std::pair verAndTag = wait(addStorageServer( self.cx, ssi)); // Might throw recruitment_failed in case of simultaneous master failure self.tag = verAndTag.second; - self.setInitialVersion(verAndTag.first - 1); + // self.setInitialVersion(ssi.isTss ? 
0 : verAndTag.first - 1); + if (ssi.isTss) { + printf("TSS %s overriding initial version from %lld to %lld\n", + ssi.id().toString().c_str(), + verAndTag.first - 1, + tssSeedVersion); + self.setInitialVersion(tssSeedVersion); + } else { + self.setInitialVersion(verAndTag.first - 1); + } } else { self.tag = seedTag; } @@ -4723,7 +5099,8 @@ ACTOR Future storageServer(IKeyValueStore* persistentData, TraceEvent("StorageServerInit", ssi.id()) .detail("Version", self.version.get()) - .detail("SeedTag", seedTag.toString()); + .detail("SeedTag", seedTag.toString()) + .detail("TssPair", ssi.isTss ? ssi.tssPairID.toString() : ""); InitializeStorageReply rep; rep.interf = ssi; rep.addedVersion = self.version.get(); @@ -4744,6 +5121,10 @@ ACTOR Future storageServer(IKeyValueStore* persistentData, } ACTOR Future replaceInterface(StorageServer* self, StorageServerInterface ssi) { + printf("SS %s replacing interface\ngetValue=%s\n", + ssi.id().toString().c_str(), + ssi.getValue.getEndpoint().token.toString().c_str()); + ASSERT(!ssi.isTss); state Transaction tr(self->cx); loop { @@ -4758,8 +5139,17 @@ ACTOR Future replaceInterface(StorageServer* self, StorageServerInterface GetStorageServerRejoinInfoRequest(ssi.id(), ssi.locality.dcId())) : Never())) { state GetStorageServerRejoinInfoReply rep = _rep; + + printf("SS %s got rejoin reply:\nversion: %" PRIu64 "\ntag: %s\nnewTag: %s\nnewLocality: %s\n", + ssi.id().toString().c_str(), + rep.version, + rep.tag.toString().c_str(), + rep.newTag.present() ? rep.newTag.get().toString().c_str() : "", + rep.newLocality ? "true" : "false"); + try { tr.reset(); + // TODO why doesn't this need ACCESS_SYSTEM_KEYS? 
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); tr.setVersion(rep.version); @@ -4776,6 +5166,7 @@ ACTOR Future replaceInterface(StorageServer* self, StorageServerInterface tagLocalityListValue(rep.newTag.get().locality)); } + // this only should happen if SS moved datacenters if (rep.newTag.present()) { KeyRange conflictRange = singleKeyRange(serverTagConflictKeyFor(rep.newTag.get())); tr.addReadConflictRange(conflictRange); @@ -4793,6 +5184,7 @@ ACTOR Future replaceInterface(StorageServer* self, StorageServerInterface choose { when(wait(tr.commit())) { + printf("SS committed rejoin txn\n"); self->history = rep.history; if (rep.newTag.present()) { @@ -4821,6 +5213,7 @@ ACTOR Future replaceInterface(StorageServer* self, StorageServerInterface when(wait(infoChanged)) {} } } catch (Error& e) { + printf("rejoin txn got error: %d!!\n", e.code()); wait(tr.onError(e)); } } @@ -4831,6 +5224,64 @@ ACTOR Future replaceInterface(StorageServer* self, StorageServerInterface return Void(); } +ACTOR Future replaceTSSInterface(StorageServer* self, StorageServerInterface ssi) { + // RYW for KeyBackedMap + state Reference tr = makeReference(self->cx); + state KeyBackedMap tssMapDB = KeyBackedMap(tssMappingKeys.begin); + + ASSERT(ssi.isTss); + + printf("TSS %s replacing interface:\ngetValue=%s\n", + ssi.id().toString().c_str(), + ssi.getValue.getEndpoint().token.toString().c_str()); + + // TODO should this loop until successful? it should never have conflicts, in theory + + loop { + try { + state Tag myTag; + + tr->reset(); + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); // TODO is this needed? 
+ tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); + + Optional pairTagValue = wait(tr->get(serverTagKeyFor(self->tssPairID.get()))); + + if (!pairTagValue.present()) { + TEST(true); // Race where tss was down, pair was removed, tss starts back up + throw worker_removed(); + } + + myTag = decodeServerTagValue(pairTagValue.get()); + + tr->addReadConflictRange(singleKeyRange(serverListKeyFor(ssi.id()))); + tr->set(serverListKeyFor(ssi.id()), serverListValue(ssi)); + + // add itself back to tss mapping + // tr->set(tssMappingKeyFor(self->tssPairID.get()), tssMappingValueFor(ssi.id())); + tssMapDB.set(tr, self->tssPairID.get(), ssi.id()); + tr->set(tssMappingChangeKey, deterministicRandom()->randomUniqueID().toString()); + + wait(tr->commit()); + + // TODO trace event instead + printf("tss %s added itself back, got tag %s for partner %s\n", + self->thisServerID.toString().c_str(), + self->tag.toString().c_str(), + self->tssPairID.get().toString().c_str()); + self->tag = myTag; + + break; + } catch (Error& e) { + printf("tss replace interface got error %d!!\n", e.code()); + wait(tr->onError(e)); + } + } + + return Void(); +} + +// for recovering an existing storage server ACTOR Future storageServer(IKeyValueStore* persistentData, StorageServerInterface ssi, Reference> db, @@ -4839,7 +5290,7 @@ ACTOR Future storageServer(IKeyValueStore* persistentData, Reference connFile) { state StorageServer self(persistentData, db, ssi); self.folder = folder; - self.sk = serverKeysPrefixFor(self.thisServerID).withPrefix(systemKeys.begin); // FFFF/serverKeys/[this server]/ + try { state double start = now(); TraceEvent("StorageServerRebootStart", self.thisServerID); @@ -4864,13 +5315,30 @@ ACTOR Future storageServer(IKeyValueStore* persistentData, } TraceEvent("SSTimeRestoreDurableState", self.thisServerID).detail("TimeTaken", now() - start); + // if this is a tss storage file, use that as source of truth for this server being a tss instead of the + // presence of the 
tss pair key in the storage engine + if (ssi.isTss) { + ASSERT(self.isTss()); + ssi.tssPairID = self.tssPairID.get(); + } else { + ASSERT(!self.isTss()); + } + ASSERT(self.thisServerID == ssi.id()); + + self.sk = serverKeysPrefixFor(self.tssPairID.present() ? self.tssPairID.get() : self.thisServerID) + .withPrefix(systemKeys.begin); // FFFF/serverKeys/[this server]/ + TraceEvent("StorageServerReboot", self.thisServerID).detail("Version", self.version.get()); if (recovered.canBeSet()) recovered.send(Void()); - wait(replaceInterface(&self, ssi)); + if (self.isTss()) { + wait(replaceTSSInterface(&self, ssi)); + } else { + wait(replaceInterface(&self, ssi)); + } TraceEvent("StorageServerStartingCore", self.thisServerID).detail("TimeTaken", now() - start); diff --git a/fdbserver/tester.actor.cpp b/fdbserver/tester.actor.cpp index 4b98b38486..1b23040d0d 100644 --- a/fdbserver/tester.actor.cpp +++ b/fdbserver/tester.actor.cpp @@ -869,6 +869,7 @@ ACTOR Future checkConsistency(Database cx, std::vector testers, bool doQuiescentCheck, bool doCacheCheck, + bool doTSSCheck, double quiescentWaitTimeout, double softTimeLimit, double databasePingDelay, @@ -885,12 +886,16 @@ ACTOR Future checkConsistency(Database cx, Standalone> options; StringRef performQuiescent = LiteralStringRef("false"); StringRef performCacheCheck = LiteralStringRef("false"); + StringRef performTSSCheck = LiteralStringRef("false"); if (doQuiescentCheck) { performQuiescent = LiteralStringRef("true"); } if (doCacheCheck) { performCacheCheck = LiteralStringRef("true"); } + if (doTSSCheck) { + performTSSCheck = LiteralStringRef("true"); + } spec.title = LiteralStringRef("ConsistencyCheck"); spec.databasePingDelay = databasePingDelay; spec.timeout = 32000; @@ -898,6 +903,7 @@ ACTOR Future checkConsistency(Database cx, KeyValueRef(LiteralStringRef("testName"), LiteralStringRef("ConsistencyCheck"))); options.push_back_deep(options.arena(), KeyValueRef(LiteralStringRef("performQuiescentChecks"), performQuiescent)); 
options.push_back_deep(options.arena(), KeyValueRef(LiteralStringRef("performCacheCheck"), performCacheCheck)); + options.push_back_deep(options.arena(), KeyValueRef(LiteralStringRef("performTSSCheck"), performTSSCheck)); options.push_back_deep(options.arena(), KeyValueRef(LiteralStringRef("quiescentWaitTimeout"), ValueRef(options.arena(), format("%f", quiescentWaitTimeout)))); @@ -973,6 +979,8 @@ ACTOR Future runTest(Database cx, testers, quiescent, spec.runConsistencyCheckOnCache, + // spec.runConsistencyCheckOnTSS, // TODO override with true to test + true, 10000.0, 18000, spec.databasePingDelay, @@ -1108,6 +1116,11 @@ std::maprunConsistencyCheckOnCache = (value == "true"); TraceEvent("TestParserTest").detail("ParsedRunConsistencyCheckOnCache", spec->runConsistencyCheckOnCache); } }, + { "runConsistencyCheckOnTSS", + [](const std::string& value, TestSpec* spec) { + spec->runConsistencyCheckOnTSS = (value == "true"); + TraceEvent("TestParserTest").detail("ParsedRunConsistencyCheckOnTSS", spec->runConsistencyCheckOnTSS); + } }, { "waitForQuiescence", [](const std::string& value, TestSpec* spec) { bool toWait = value == "true"; @@ -1416,14 +1429,19 @@ ACTOR Future runTests(ReferenceisSimulated() && enableDD) { + printf("waiting for DD\n"); wait(success(setDDMode(cx, 1))); + printf("done waiting for DD\n"); } } catch (Error& e) { TraceEvent(SevError, "TestFailure").error(e).detail("Reason", "Unable to set starting configuration"); } } + printf("starting configuration set, moving on\n"); + if (useDB && waitForQuiescenceBegin) { TraceEvent("TesterStartingPreTestChecks") .detail("DatabasePingDelay", databasePingDelay) @@ -1439,6 +1457,8 @@ ACTOR Future runTests(Reference loadedPonger(FutureStream pings) { } StringRef fileStoragePrefix = LiteralStringRef("storage-"); +StringRef testingStoragePrefix = LiteralStringRef("testingstorage-"); StringRef fileLogDataPrefix = LiteralStringRef("log-"); StringRef fileVersionedLogDataPrefix = LiteralStringRef("log2-"); StringRef 
fileLogQueuePrefix = LiteralStringRef("logqueue-"); @@ -315,6 +316,7 @@ std::string filenameFromSample(KeyValueStoreType storeType, std::string folder, } std::string filenameFromId(KeyValueStoreType storeType, std::string folder, std::string prefix, UID id) { + if (storeType == KeyValueStoreType::SSD_BTREE_V1) return joinPath(folder, prefix + id.toString() + ".fdb"); else if (storeType == KeyValueStoreType::SSD_BTREE_V2) @@ -326,6 +328,7 @@ std::string filenameFromId(KeyValueStoreType storeType, std::string folder, std: else if (storeType == KeyValueStoreType::SSD_ROCKSDB_V1) return joinPath(folder, prefix + id.toString() + ".rocksdb"); + printf("UNKNOWN storeType %s\n", storeType.toString().c_str()); UNREACHABLE(); } @@ -444,6 +447,9 @@ std::vector getDiskStores(std::string folder, if (filename.startsWith(fileStoragePrefix)) { store.storedComponent = DiskStore::Storage; prefix = fileStoragePrefix; + } else if (filename.startsWith(testingStoragePrefix)) { + store.storedComponent = DiskStore::Storage; + prefix = testingStoragePrefix; } else if (filename.startsWith(fileVersionedLogDataPrefix)) { store.storedComponent = DiskStore::TLogData; // Use the option string that's in the file rather than tLogOptions.toPrefix(), @@ -739,6 +745,7 @@ ACTOR Future storageServerRollbackRebooter(Future prevStorageServer, std::string filename, UID id, LocalityData locality, + bool isTss, Reference> db, std::string folder, ActorCollection* filesClosed, @@ -756,6 +763,7 @@ ACTOR Future storageServerRollbackRebooter(Future prevStorageServer, StorageServerInterface recruited; recruited.uniqueID = id; recruited.locality = locality; + recruited.isTss = isTss; recruited.initEndpoints(); DUMPTOKEN(recruited.getValue); @@ -1097,14 +1105,26 @@ ACTOR Future workerServer(Reference connFile, Future kvClosed = kv->onClosed(); filesClosed.add(kvClosed); + // std::string doesn't have startsWith + std::string tssPrefix = testingStoragePrefix.toString(); + // TODO might be more efficient to mark a 
boolean on DiskStore in getDiskStores, but that kind of breaks + // the abstraction since DiskStore also applies to storage cache + tlog + bool isTss = s.filename.find(tssPrefix) != std::string::npos; + // TODO REMOVE after test + printf("%s is%s tss filename\n", s.filename.c_str(), isTss ? "" : " not"); + Role ssRole = isTss ? Role::TESTING_STORAGE_SERVER : Role::STORAGE_SERVER; + StorageServerInterface recruited; recruited.uniqueID = s.storeID; recruited.locality = locality; + recruited.isTss = isTss; recruited.initEndpoints(); std::map details; details["StorageEngine"] = s.storeType.toString(); - startRole(Role::STORAGE_SERVER, recruited.id(), interf.id(), details, "Restored"); + details["IsTSS"] = isTss ? "Yes" : "No"; + + startRole(ssRole, recruited.id(), interf.id(), details, "Restored"); DUMPTOKEN(recruited.getValue); DUMPTOKEN(recruited.getKey); @@ -1129,12 +1149,13 @@ ACTOR Future workerServer(Reference connFile, s.filename, recruited.id(), recruited.locality, + isTss, dbInfo, folder, &filesClosed, memoryLimit, kv); - errorForwarders.add(forwardError(errors, Role::STORAGE_SERVER, recruited.id(), f)); + errorForwarders.add(forwardError(errors, ssRole, recruited.id(), f)); } else if (s.storedComponent == DiskStore::TLogData) { std::string logQueueBasename; const std::string filename = basename(s.filename); @@ -1487,13 +1508,29 @@ ACTOR Future workerServer(Reference connFile, } when(InitializeStorageRequest req = waitNext(interf.storage.getFuture())) { if (!storageCache.exists(req.reqId)) { + + printf("Got " + "InitializeStorageRequest:seedTag=%s\nreqId=%s\ninterfaceId=%s\nstoreType=%s\nisTss=%" + "s\ntssPairID=%s\ntssPairVersion=%lld\n\n", + req.seedTag.toString().c_str(), + req.reqId.toString().c_str(), + req.interfaceId.toString().c_str(), + req.storeType.toString().c_str(), + req.isTss ? "true" : "false", + req.isTss ? req.tssPairID.toString().c_str() : "", + req.isTss ? 
req.tssPairVersion : 0); + StorageServerInterface recruited(req.interfaceId); recruited.locality = locality; + recruited.isTss = req.isTss; + recruited.tssPairID = req.tssPairID; recruited.initEndpoints(); std::map details; details["StorageEngine"] = req.storeType.toString(); - startRole(Role::STORAGE_SERVER, recruited.id(), interf.id(), details); + details["IsTSS"] = std::to_string(recruited.isTss); + Role ssRole = recruited.isTss ? Role::TESTING_STORAGE_SERVER : Role::STORAGE_SERVER; + startRole(ssRole, recruited.id(), interf.id(), details); DUMPTOKEN(recruited.getValue); DUMPTOKEN(recruited.getKey); @@ -1508,16 +1545,21 @@ ACTOR Future workerServer(Reference connFile, DUMPTOKEN(recruited.getQueuingMetrics); DUMPTOKEN(recruited.getKeyValueStoreType); DUMPTOKEN(recruited.watchValue); - // printf("Recruited as storageServer\n"); + // TODO re-comment! + printf("Recruited as storageServer\n"); std::string filename = - filenameFromId(req.storeType, folder, fileStoragePrefix.toString(), recruited.id()); + filenameFromId(req.storeType, + folder, + recruited.isTss ? 
testingStoragePrefix.toString() : fileStoragePrefix.toString(), + recruited.id()); IKeyValueStore* data = openKVStore(req.storeType, filename, recruited.id(), memoryLimit); Future kvClosed = data->onClosed(); filesClosed.add(kvClosed); ReplyPromise storageReady = req.reply; storageCache.set(req.reqId, storageReady.getFuture()); - Future s = storageServer(data, recruited, req.seedTag, storageReady, dbInfo, folder); + Future s = + storageServer(data, recruited, req.seedTag, req.tssPairVersion, storageReady, dbInfo, folder); s = handleIOErrors(s, data, recruited.id(), kvClosed); s = storageCache.removeOnReady(req.reqId, s); s = storageServerRollbackRebooter(s, @@ -1525,12 +1567,13 @@ ACTOR Future workerServer(Reference connFile, filename, recruited.id(), recruited.locality, + req.isTss, dbInfo, folder, &filesClosed, memoryLimit, data); - errorForwarders.add(forwardError(errors, Role::STORAGE_SERVER, recruited.id(), s)); + errorForwarders.add(forwardError(errors, ssRole, recruited.id(), s)); } else forwardPromise(req.reply, storageCache.get(req.reqId)); } @@ -2111,6 +2154,7 @@ ACTOR Future fdbd(Reference connFile, const Role Role::WORKER("Worker", "WK", false); const Role Role::STORAGE_SERVER("StorageServer", "SS"); +const Role Role::TESTING_STORAGE_SERVER("TestingStorageServer", "TS"); const Role Role::TRANSACTION_LOG("TLog", "TL"); const Role Role::SHARED_TRANSACTION_LOG("SharedTLog", "SL", false); const Role Role::COMMIT_PROXY("CommitProxyServer", "CP"); @@ -2118,7 +2162,7 @@ const Role Role::GRV_PROXY("GrvProxyServer", "GP"); const Role Role::MASTER("MasterServer", "MS"); const Role Role::RESOLVER("Resolver", "RV"); const Role Role::CLUSTER_CONTROLLER("ClusterController", "CC"); -const Role Role::TESTER("Tester", "TS"); +const Role Role::TESTER("TestClient", "TC"); const Role Role::LOG_ROUTER("LogRouter", "LR"); const Role Role::DATA_DISTRIBUTOR("DataDistributor", "DD"); const Role Role::RATEKEEPER("Ratekeeper", "RK"); diff --git 
a/fdbserver/workloads/ConsistencyCheck.actor.cpp b/fdbserver/workloads/ConsistencyCheck.actor.cpp index 722e1f5e6e..0aae7ca9d4 100644 --- a/fdbserver/workloads/ConsistencyCheck.actor.cpp +++ b/fdbserver/workloads/ConsistencyCheck.actor.cpp @@ -48,6 +48,9 @@ struct ConsistencyCheckWorkload : TestWorkload { // Whether or not perform consistency check between storage cache servers and storage servers bool performCacheCheck; + // Whether or not to perform consistency check between storage servers and pair TSS + bool performTSSCheck; + // How long to wait for the database to go quiet before failing (if doing quiescent checks) double quiescentWaitTimeout; @@ -94,6 +97,7 @@ struct ConsistencyCheckWorkload : TestWorkload { ConsistencyCheckWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) { performQuiescentChecks = getOption(options, LiteralStringRef("performQuiescentChecks"), false); performCacheCheck = getOption(options, LiteralStringRef("performCacheCheck"), false); + performTSSCheck = getOption(options, LiteralStringRef("performTSSCheck"), false); quiescentWaitTimeout = getOption(options, LiteralStringRef("quiescentWaitTimeout"), 600.0); distributed = getOption(options, LiteralStringRef("distributed"), true); shardSampleFactor = std::max(getOption(options, LiteralStringRef("shardSampleFactor"), 1), 1); @@ -1057,7 +1061,9 @@ struct ConsistencyCheckWorkload : TestWorkload { TraceEvent("ConsistencyCheck_FailedToFetchMetrics") .detail("Begin", printable(shard.begin)) .detail("End", printable(shard.end)) - .detail("StorageServer", storageServers[i].id()); + .detail("StorageServer", storageServers[i].id()) + .detail("IsTSS", storageServers[i].isTss ? 
"True" : "False") + .error(reply.getError()); estimatedBytes.push_back(-1); } @@ -1074,7 +1080,10 @@ struct ConsistencyCheckWorkload : TestWorkload { .detail("Begin", printable(shard.begin)) .detail("End", printable(shard.end)) .detail("StorageServer1", storageServers[firstValidStorageServer].id()) - .detail("StorageServer2", storageServers[i].id()); + .detail("StorageServer2", storageServers[i].id()) + .detail("IsTSS", + storageServers[i].isTss || storageServers[firstValidStorageServer].isTss ? "True" + : "False"); } } } @@ -1236,6 +1245,28 @@ struct ConsistencyCheckWorkload : TestWorkload { } } + // add TSS to end of list, if configured and if not relocating + if (!isRelocating && self->performTSSCheck) { + printf("CCheck: Checking for tss to add: isRelocating: %s, performTSSCheck: %s\n", + isRelocating ? "T" : "F", + self->performTSSCheck ? "T" : "F"); + int initialSize = storageServers.size(); + for (int i = 0; i < initialSize; i++) { + Optional tssPair = cx->clientInfo->get().getTssPair(storageServers[i]); + if (tssPair.present()) { + printf("CCheck: Adding TSS %s to consistency check!\n", tssPair.get().id().toString().c_str()); + storageServers.push_back(tssPair.get().id()); + storageServerInterfaces.push_back(tssPair.get()); + } else { + printf("CCheck: SS %s doesn't have tss pair\n", storageServers[i].toString().c_str()); + } + } + } else { + printf("CCheck: Not checking for tss to add: isRelocating: %s, performTSSCheck: %s\n", + isRelocating ? "T" : "F", + self->performTSSCheck ? "T" : "F"); + } + state vector estimatedBytes = wait(self->getStorageSizeEstimate(storageServerInterfaces, range)); // Gets permitted size range of shard @@ -1323,7 +1354,8 @@ struct ConsistencyCheckWorkload : TestWorkload { // Be especially verbose if in simulation if (g_network->isSimulated()) { int invalidIndex = -1; - printf("\nSERVER %d (%s); shard = %s - %s:\n", + printf("\n%sSERVER %d (%s); shard = %s - %s:\n", + storageServerInterfaces[j].isTss ? 
"TSS " : "", j, storageServerInterfaces[j].address().toString().c_str(), printable(req.begin.getKey()).c_str(), @@ -1341,7 +1373,8 @@ struct ConsistencyCheckWorkload : TestWorkload { } printf( - "\nSERVER %d (%s); shard = %s - %s:\n", + "\n%sSERVER %d (%s); shard = %s - %s:\n", + storageServerInterfaces[firstValidServer].isTss ? "TSS " : "", firstValidServer, storageServerInterfaces[firstValidServer].address().toString().c_str(), printable(req.begin.getKey()).c_str(), @@ -1430,16 +1463,31 @@ struct ConsistencyCheckWorkload : TestWorkload { printable(referenceUniqueKey)) .detail("ValueMismatches", valueMismatches) .detail("ValueMismatchKey", printable(valueMismatchKey)) - .detail("MatchingKVPairs", matchingKVPairs); + .detail("MatchingKVPairs", matchingKVPairs) + .detail("IsTSS", + storageServerInterfaces[j].isTss || + storageServerInterfaces[firstValidServer].isTss + ? "True" + : "False"); - self->testFailure("Data inconsistent", true); - return false; + // TODO should the test still fail if TSS is wrong? Or is just logging the trace + // logs ok + if ((g_network->isSimulated() && + g_simulator.tssMode != ISimulator::TSSMode::EnabledDropMutations) || + (!storageServerInterfaces[j].isTss && + !storageServerInterfaces[firstValidServer].isTss)) { + self->testFailure("Data inconsistent", true); + return false; + } } } } // If the data is not available and we aren't relocating this shard else if (!isRelocating) { + Error e = + rangeResult.isError() ? 
rangeResult.getError() : rangeResult.get().error.get(); + TraceEvent("ConsistencyCheck_StorageServerUnavailable") .suppressFor(1.0) .detail("StorageServer", storageServers[j]) @@ -1448,10 +1496,20 @@ struct ConsistencyCheckWorkload : TestWorkload { .detail("Address", storageServerInterfaces[j].address()) .detail("UID", storageServerInterfaces[j].id()) .detail("GetKeyValuesToken", - storageServerInterfaces[j].getKeyValues.getEndpoint().token); + storageServerInterfaces[j].getKeyValues.getEndpoint().token) + .detail("IsTSS", storageServerInterfaces[j].isTss ? "True" : "False") + .error(e); + + printf("CC %sSS %s failed with error % d\n", + storageServerInterfaces[j].isTss ? "T" : "", + storageServers[j].toString().c_str(), + e.code()); // All shards should be available in quiscence - if (self->performQuiescentChecks) { + // TODO should the test still fail if TSS is unavailable? Or is just logging the trace + // logs ok + if (self->performQuiescentChecks && + (g_network->isSimulated() || !storageServerInterfaces[j].isTss)) { self->testFailure("Storage server unavailable"); return false; } @@ -1546,19 +1604,25 @@ struct ConsistencyCheckWorkload : TestWorkload { bool hasValidEstimate = estimatedBytes.size() > 0; // If the storage servers' sampled estimate of shard size is different from ours + // TODO should the test still fail if TSS has wrong estimate? Or is just logging the trace logs ok if (self->performQuiescentChecks) { for (int j = 0; j < estimatedBytes.size(); j++) { if (estimatedBytes[j] >= 0 && estimatedBytes[j] != sampledBytes) { TraceEvent("ConsistencyCheck_IncorrectEstimate") .detail("EstimatedBytes", estimatedBytes[j]) .detail("CorrectSampledBytes", sampledBytes) - .detail("StorageServer", storageServers[j]); - self->testFailure("Storage servers had incorrect sampled estimate"); + .detail("StorageServer", storageServers[j]) + .detail("IsTSS", storageServerInterfaces[j].isTss ? 
"True" : "False"); + + if (!storageServerInterfaces[j].isTss) { + self->testFailure("Storage servers had incorrect sampled estimate"); + } hasValidEstimate = false; break; - } else if (estimatedBytes[j] < 0) { + } else if (estimatedBytes[j] < 0 && + (g_network->isSimulated() || !storageServerInterfaces[j].isTss)) { self->testFailure("Could not get storage metrics from server"); hasValidEstimate = false; break; @@ -1670,7 +1734,9 @@ struct ConsistencyCheckWorkload : TestWorkload { if (!keyValueStoreType.present()) { TraceEvent("ConsistencyCheck_ServerUnavailable").detail("ServerID", storageServers[i].id()); self->testFailure("Storage server unavailable"); - } else if (keyValueStoreType.get() != configuration.storageServerStoreType) { + } else if ((!storageServers[i].isTss && keyValueStoreType.get() != configuration.storageServerStoreType) || + (storageServers[i].isTss && + keyValueStoreType.get() != configuration.testingStorageServerStoreType)) { TraceEvent("ConsistencyCheck_WrongKeyValueStoreType") .detail("ServerID", storageServers[i].id()) .detail("StoreType", keyValueStoreType.get().toString()) @@ -1681,6 +1747,10 @@ struct ConsistencyCheckWorkload : TestWorkload { // Check each pair of storage servers for an address match for (j = i + 1; j < storageServers.size(); j++) { + // TODO change this hack back once i fix recruitment + /*if (storageServers[i].isTss || storageServers[j].isTss) { + continue; + }*/ if (storageServers[i].address() == storageServers[j].address()) { TraceEvent("ConsistencyCheck_UndesirableServer") .detail("StorageServer1", storageServers[i].id()) @@ -1701,8 +1771,18 @@ struct ConsistencyCheckWorkload : TestWorkload { ConsistencyCheckWorkload* self) { state vector workers = wait(getWorkers(self->dbInfo)); state vector storageServers = wait(getStorageServers(cx)); - std::set> missingStorage; + std::vector> missingStorage; // vector instead of a set to get the count + printf("CC starting check for storage: %d workers, %d SS\n", workers.size(), 
storageServers.size()); + printf("CC checking %d regions: ", configuration.regions.size()); + if (configuration.regions.size() == 1) { + printf("%s", configuration.regions[0].dcId.toString().c_str()); + } else if (configuration.regions.size() == 2) { + printf("%s %s", + configuration.regions[0].dcId.toString().c_str(), + configuration.regions[1].dcId.toString().c_str()); + } + printf("\n"); for (int i = 0; i < workers.size(); i++) { NetworkAddress addr = workers[i].interf.stableAddress(); if (!configuration.isExcludedServer(workers[i].interf.addresses()) && @@ -1712,29 +1792,83 @@ struct ConsistencyCheckWorkload : TestWorkload { for (int j = 0; j < storageServers.size(); j++) { if (storageServers[j].stableAddress() == addr) { found = true; + printf("CC found SS %s on %s in dc %s\n", + storageServers[j].id().toString().c_str(), + addr.toString().c_str(), + workers[i].interf.locality.dcId().present() + ? workers[i].interf.locality.dcId().get().toString().c_str() + : ""); break; } } if (!found) { + if (configuration.regions.size() == 0 || + (configuration.regions.size() == 1 && + workers[i].interf.locality.dcId() == configuration.regions[0].dcId) || + (configuration.regions.size() == 2 && + (workers[i].interf.locality.dcId() == configuration.regions[0].dcId || + workers[i].interf.locality.dcId() == configuration.regions[1].dcId))) { + printf("CC found no SS on %s in dc %s\n", + addr.toString().c_str(), + workers[i].interf.locality.dcId().present() + ? workers[i].interf.locality.dcId().get().toString().c_str() + : ""); + } + TraceEvent("ConsistencyCheck_NoStorage") .detail("Address", addr) .detail("ProcessClassEqualToStorageClass", (int)(workers[i].processClass == ProcessClass::StorageClass)); - missingStorage.insert(workers[i].interf.locality.dcId()); + missingStorage.push_back(workers[i].interf.locality.dcId()); } } } + int missingDc0 = configuration.regions.size() == 0 + ? 
0 + : std::count(missingStorage.begin(), missingStorage.end(), configuration.regions[0].dcId); + int missingDc1 = configuration.regions.size() < 2 + ? 0 + : std::count(missingStorage.begin(), missingStorage.end(), configuration.regions[1].dcId); + if ((configuration.regions.size() == 0 && missingStorage.size()) || - (configuration.regions.size() == 1 && missingStorage.count(configuration.regions[0].dcId)) || - (configuration.regions.size() == 2 && configuration.usableRegions == 1 && - missingStorage.count(configuration.regions[0].dcId) && - missingStorage.count(configuration.regions[1].dcId)) || - (configuration.regions.size() == 2 && configuration.usableRegions > 1 && - (missingStorage.count(configuration.regions[0].dcId) || - missingStorage.count(configuration.regions[1].dcId)))) { - self->testFailure("No storage server on worker"); - return false; + (configuration.regions.size() == 1 && missingDc0) || + (configuration.regions.size() == 2 && configuration.usableRegions == 1 && missingDc0 && missingDc1) || + (configuration.regions.size() == 2 && configuration.usableRegions > 1 && (missingDc0 || missingDc1))) { + + // TODO could improve this check by also ensuring DD is currently recruiting a TSS by using quietdb? + bool couldExpectMissingTss = + (configuration.desiredTSSCount - self->dbInfo->get().client.tssMapping.size()) > 0; + printf("CC couldExpectMissingTss = %s\n", couldExpectMissingTss ? 
"True" : "False"); + + int countMissing = missingStorage.size(); + int acceptableTssMissing = 1; + if (configuration.regions.size() == 1) { + countMissing = missingDc0; + } else if (configuration.regions.size() == 2) { + if (configuration.usableRegions == 1) { + // all processes should be missing from 1, so take the number missing from the other + countMissing = std::min(missingDc0, missingDc1); + } else if (configuration.usableRegions == 2) { + countMissing = missingDc0 + missingDc1; + acceptableTssMissing = 2; + } else { + ASSERT(false); // in case fdb ever adds 3+ region support? + } + } + + if (!couldExpectMissingTss || countMissing > acceptableTssMissing) { + printf("No storage server on %d workers. CouldBeTSS=%s, acceptableTssMissing=%d\n", + countMissing, + couldExpectMissingTss ? "T" : "F", + acceptableTssMissing); + self->testFailure("No storage server on worker"); + return false; + } else { + // TODO sev=30 warn instead of print + printf("CC found %d missing storage server on worker, but it is likely a tss(es) waiting for a pair\n", + configuration.usableRegions); + } } return true; @@ -1751,8 +1885,10 @@ struct ConsistencyCheckWorkload : TestWorkload { state bool foundExtraDataStore = false; state std::vector protectedProcessesToKill; + printf("CC checking for extra data stores\n"); state std::map> statefulProcesses; for (const auto& ss : storageServers) { + printf("CC Marking %ss as ok\n", ss.id().toString().c_str()); statefulProcesses[ss.address()].insert(ss.id()); // A process may have two addresses (same ip, different ports) if (ss.secondaryAddress().present()) { @@ -1809,6 +1945,9 @@ struct ConsistencyCheckWorkload : TestWorkload { if (statefulProcesses[itr->interf.address()].count(id)) { continue; } + printf("CC found extra data store %s on %s\n", + id.toString().c_str(), + itr->interf.address().toString().c_str()); // For extra data store TraceEvent("ConsistencyCheck_ExtraDataStore") .detail("Address", itr->interf.address()) @@ -1841,7 +1980,10 @@ 
struct ConsistencyCheckWorkload : TestWorkload { } } + printf("CC check for extra data stores complete\n"); + if (foundExtraDataStore) { + printf("CC Extra Data Stores\n"); self->testFailure("Extra data stores present on workers"); return false; } diff --git a/fdbserver/workloads/RandomMoveKeys.actor.cpp b/fdbserver/workloads/RandomMoveKeys.actor.cpp index 8bdafb35e0..4fe5654a18 100644 --- a/fdbserver/workloads/RandomMoveKeys.actor.cpp +++ b/fdbserver/workloads/RandomMoveKeys.actor.cpp @@ -162,12 +162,13 @@ struct MoveKeysWorkload : TestWorkload { // The real data distribution algorithm doesn't want to deal with multiple servers // with the same address having keys. So if there are two servers with the same address, // don't use either one (so we don't have to find out which of them, if any, already has keys). + // Also get rid of tss since we don't want to move a shard to a tss. std::map count; for (int s = 0; s < servers.size(); s++) count[servers[s].address()]++; int o = 0; for (int s = 0; s < servers.size(); s++) - if (count[servers[s].address()] == 1) + if (count[servers[s].address()] == 1 && !servers[s].isTss) servers[o++] = servers[s]; servers.resize(o); } diff --git a/fdbserver/workloads/workloads.actor.h b/fdbserver/workloads/workloads.actor.h index 36fcf28312..ffd669e88b 100644 --- a/fdbserver/workloads/workloads.actor.h +++ b/fdbserver/workloads/workloads.actor.h @@ -152,6 +152,7 @@ public: databasePingDelay = g_network->isSimulated() ? 
0.0 : 15.0; runConsistencyCheck = g_network->isSimulated(); runConsistencyCheckOnCache = false; + runConsistencyCheckOnTSS = false; waitForQuiescenceBegin = true; waitForQuiescenceEnd = true; simCheckRelocationDuration = false; @@ -167,8 +168,8 @@ public: double databasePingDelay = -1.0) : title(title), dumpAfterTest(dump), clearAfterTest(clear), startDelay(startDelay), useDB(useDB), timeout(600), databasePingDelay(databasePingDelay), runConsistencyCheck(g_network->isSimulated()), - runConsistencyCheckOnCache(false), waitForQuiescenceBegin(true), waitForQuiescenceEnd(true), - simCheckRelocationDuration(false), simConnectionFailuresDisableDuration(0), + runConsistencyCheckOnCache(false), runConsistencyCheckOnTSS(false), waitForQuiescenceBegin(true), + waitForQuiescenceEnd(true), simCheckRelocationDuration(false), simConnectionFailuresDisableDuration(0), simBackupAgents(ISimulator::BackupAgentType::NoBackupAgents), simDrAgents(ISimulator::BackupAgentType::NoBackupAgents) { phases = TestWorkload::SETUP | TestWorkload::EXECUTION | TestWorkload::CHECK | TestWorkload::METRICS; @@ -187,6 +188,7 @@ public: double databasePingDelay; bool runConsistencyCheck; bool runConsistencyCheckOnCache; + bool runConsistencyCheckOnTSS; bool waitForQuiescenceBegin; bool waitForQuiescenceEnd; diff --git a/flow/ProtocolVersion.h b/flow/ProtocolVersion.h index d3c601a9b5..7feb6b3839 100644 --- a/flow/ProtocolVersion.h +++ b/flow/ProtocolVersion.h @@ -121,7 +121,7 @@ public: // introduced features PROTOCOL_VERSION_FEATURE(0x0FDB00B062010001LL, CloseUnusedConnection); PROTOCOL_VERSION_FEATURE(0x0FDB00B063010000LL, DBCoreState); PROTOCOL_VERSION_FEATURE(0x0FDB00B063010000LL, TagThrottleValue); - PROTOCOL_VERSION_FEATURE(0x0FDB00B063010000LL, ServerListValue); + PROTOCOL_VERSION_FEATURE(0x0FDB00B070010001LL, ServerListValue); PROTOCOL_VERSION_FEATURE(0x0FDB00B063010000LL, StorageCacheValue); PROTOCOL_VERSION_FEATURE(0x0FDB00B063010000LL, RestoreStatusValue); 
PROTOCOL_VERSION_FEATURE(0x0FDB00B063010000LL, RestoreRequestValue); @@ -138,6 +138,8 @@ public: // introduced features PROTOCOL_VERSION_FEATURE(0x0FDB00B070010000LL, StableInterfaces); PROTOCOL_VERSION_FEATURE(0x0FDB00B070010001LL, TagThrottleValueReason); PROTOCOL_VERSION_FEATURE(0x0FDB00B070010001LL, SpanContext); + // TODO is this right? + PROTOCOL_VERSION_FEATURE(0x0FDB00B070010001LL, TSS); }; template <> diff --git a/flow/genericactors.actor.h b/flow/genericactors.actor.h index 7bf2a05e63..e0e84c6e25 100644 --- a/flow/genericactors.actor.h +++ b/flow/genericactors.actor.h @@ -1230,6 +1230,8 @@ Future brokenPromiseToMaybeDelivered(Future in) { return t; } catch (Error& e) { if (e.code() == error_code_broken_promise) { + // TODO REMOVE! + printf("broken promise!!"); throw request_maybe_delivered(); } throw; diff --git a/flow/serialize.h b/flow/serialize.h index 81bb18ad4d..7653648a80 100644 --- a/flow/serialize.h +++ b/flow/serialize.h @@ -22,6 +22,9 @@ #define FLOW_SERIALIZE_H #pragma once +// TODO REMOVE +#include + #include #include #include @@ -109,6 +112,12 @@ class Serializer { public: static void serialize(Archive& ar, T& t) { t.serialize(ar); + // TODO REMOVE + if (!ar.protocolVersion().isValid()) { + printf("invalid protocol version %" PRIx64 " < %" PRIx64 "!!!\n", + ar.protocolVersion().version(), + ProtocolVersion::minValidProtocolVersion); + } ASSERT(ar.protocolVersion().isValid()); } }; diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index e12b1e3ce9..63f5c3bab8 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -87,7 +87,9 @@ if(WITH_PYTHON) add_fdb_test(TEST_FILES SlowTask.txt IGNORE) add_fdb_test(TEST_FILES SpecificUnitTest.txt IGNORE) add_fdb_test(TEST_FILES StorageMetricsSampleTests.txt IGNORE) + add_fdb_test(TEST_FILES StorageServerInterface.txt) add_fdb_test(TEST_FILES StreamingWrite.txt IGNORE) + add_fdb_test(TEST_FILES SystemData.txt) add_fdb_test(TEST_FILES ThreadSafety.txt IGNORE) add_fdb_test(TEST_FILES 
TraceEventMetrics.txt IGNORE) add_fdb_test(TEST_FILES PopulateTPCC.txt IGNORE) diff --git a/tests/StorageServerInterface.txt b/tests/StorageServerInterface.txt new file mode 100644 index 0000000000..b2bf01bb14 --- /dev/null +++ b/tests/StorageServerInterface.txt @@ -0,0 +1,7 @@ +testTitle=UnitTests +startDelay=0 +useDB=false + + testName=UnitTests + maxTestCases=0 + testsMatching=/StorageServerInterface/ \ No newline at end of file diff --git a/tests/SystemData.txt b/tests/SystemData.txt new file mode 100644 index 0000000000..e8bbc2c57d --- /dev/null +++ b/tests/SystemData.txt @@ -0,0 +1,7 @@ +testTitle=UnitTests +startDelay=0 +useDB=false + + testName=UnitTests + maxTestCases=0 + testsMatching=/SystemData/ \ No newline at end of file From 4257ac2b4dac2f41df581e8f7a3105a45c2a6354 Mon Sep 17 00:00:00 2001 From: Josh Slocum Date: Wed, 12 May 2021 18:53:20 +0000 Subject: [PATCH 436/461] More TSS Changes/Fixes --- fdbclient/BackupAgentBase.actor.cpp | 4 +- fdbclient/CommitProxyInterface.h | 1 - fdbclient/DatabaseContext.h | 1 - fdbclient/ManagementAPI.actor.cpp | 1 - fdbclient/NativeAPI.actor.cpp | 103 +++--- fdbclient/StorageServerInterface.cpp | 162 ++------ fdbclient/StorageServerInterface.h | 12 +- fdbclient/SystemData.cpp | 17 +- fdbrpc/LoadBalance.actor.h | 53 +-- fdbrpc/QueueModel.cpp | 15 +- fdbrpc/QueueModel.h | 12 +- fdbrpc/TSSComparison.h | 23 +- fdbrpc/fdbrpc.h | 2 - fdbserver/ApplyMetadataMutation.cpp | 72 +++- fdbserver/ClusterController.actor.cpp | 34 +- fdbserver/CommitProxyServer.actor.cpp | 3 - fdbserver/DataDistribution.actor.cpp | 265 ++++++------- fdbserver/DataDistributionTracker.actor.cpp | 14 +- fdbserver/Knobs.cpp | 4 +- fdbserver/Knobs.h | 2 + fdbserver/MoveKeys.actor.cpp | 171 +++------ fdbserver/MutationTracking.cpp | 3 - fdbserver/QuietDatabase.actor.cpp | 6 - fdbserver/Ratekeeper.actor.cpp | 2 +- fdbserver/SimulatedCluster.actor.cpp | 23 +- fdbserver/Status.actor.cpp | 12 +- fdbserver/TLogServer.actor.cpp | 5 - 
fdbserver/WorkerInterface.actor.h | 11 +- fdbserver/masterserver.actor.cpp | 6 - fdbserver/storageserver.actor.cpp | 349 ++---------------- fdbserver/tester.actor.cpp | 10 +- fdbserver/worker.actor.cpp | 47 ++- .../workloads/ConsistencyCheck.actor.cpp | 104 +----- fdbserver/workloads/RandomMoveKeys.actor.cpp | 2 +- fdbserver/workloads/workloads.actor.h | 2 +- flow/Knobs.cpp | 1 + flow/Knobs.h | 1 + flow/genericactors.actor.h | 2 - flow/serialize.h | 9 - 39 files changed, 482 insertions(+), 1084 deletions(-) diff --git a/fdbclient/BackupAgentBase.actor.cpp b/fdbclient/BackupAgentBase.actor.cpp index cc861f310a..4b00857503 100644 --- a/fdbclient/BackupAgentBase.actor.cpp +++ b/fdbclient/BackupAgentBase.actor.cpp @@ -406,7 +406,7 @@ ACTOR Future readCommitted(Database cx, // When this buggify line is enabled, if there are more than 1 result then use half of the results // Copy the data instead of messing with the results directly to avoid TSS issues. if (values.size() > 1 && BUGGIFY) { - Standalone copy; + RangeResult copy; // only copy first half of values into copy for (int i = 0; i < values.size() / 2; i++) { copy.push_back_deep(copy.arena(), values[i]); @@ -478,7 +478,7 @@ ACTOR Future readCommitted(Database cx, // When this buggify line is enabled, if there are more than 1 result then use half of the results. // Copy the data instead of messing with the results directly to avoid TSS issues. 
if (rangevalue.size() > 1 && BUGGIFY) { - Standalone copy; + RangeResult copy; // only copy first half of rangevalue into copy for (int i = 0; i < rangevalue.size() / 2; i++) { copy.push_back_deep(copy.arena(), rangevalue[i]); diff --git a/fdbclient/CommitProxyInterface.h b/fdbclient/CommitProxyInterface.h index 16f6695a03..2ac4481a15 100644 --- a/fdbclient/CommitProxyInterface.h +++ b/fdbclient/CommitProxyInterface.h @@ -125,7 +125,6 @@ struct ClientDBInfo { bool operator!=(ClientDBInfo const& r) const { return id != r.id; } // convenience method to treat tss mapping like a map - // TODO can serializer handle maps? could just change it Optional getTssPair(UID storageServerID) const { for (auto& it : tssMapping) { if (it.first == storageServerID) { diff --git a/fdbclient/DatabaseContext.h b/fdbclient/DatabaseContext.h index b1dee87f18..2a3d2ec35a 100644 --- a/fdbclient/DatabaseContext.h +++ b/fdbclient/DatabaseContext.h @@ -425,7 +425,6 @@ public: static const std::vector debugTransactionTagChoices; std::unordered_map> watchMap; - // TODO should this be private? void maybeAddTssMapping(StorageServerInterface const& ssi); void addTssMapping(StorageServerInterface const& ssi, StorageServerInterface const& tssi); }; diff --git a/fdbclient/ManagementAPI.actor.cpp b/fdbclient/ManagementAPI.actor.cpp index 8b4d03b4d8..490d7404d9 100644 --- a/fdbclient/ManagementAPI.actor.cpp +++ b/fdbclient/ManagementAPI.actor.cpp @@ -358,7 +358,6 @@ ConfigurationResult buildConfiguration(std::vector const& modeTokens, // A new tss setup must have count + storage engine. An adjustment must have at least one. if ((isNew && (!count.present() || !storageEngine.present())) || (!isNew && !count.present() && !storageEngine.present())) { - // TODO is this the right error type? And should we log something? 
return ConfigurationResult::INCOMPLETE_CONFIGURATION; } diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 9cd9a32d8c..b5ec6c179f 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -122,7 +122,6 @@ NetworkOptions::NetworkOptions() static const Key CLIENT_LATENCY_INFO_PREFIX = LiteralStringRef("client_latency/"); static const Key CLIENT_LATENCY_INFO_CTR_PREFIX = LiteralStringRef("client_latency_counter/"); -// TODO make tss function here void DatabaseContext::maybeAddTssMapping(StorageServerInterface const& ssi) { // add tss mapping if server is new @@ -135,21 +134,14 @@ void DatabaseContext::maybeAddTssMapping(StorageServerInterface const& ssi) { // calling getInterface potentially recursively is weird, but since this function is only called when an entry is // created/changed, the recursive call should never recurse itself. void DatabaseContext::addTssMapping(StorageServerInterface const& ssi, StorageServerInterface const& tssi) { - // TODO get both with a getInterface call which will create the tss endpoint and/or update both endpoints if there - // was a change in endpoint tokens - - // the order of these is important because it hits the "different token same locality" issue, so we always want to - // request the tss first so the ss request overrides it. - // TODO this shouldn't be necessary after i stop doing the same server hack Reference tssInfo = StorageServerInfo::getInterface(this, tssi, clientLocality); Reference ssInfo = StorageServerInfo::getInterface(this, ssi, clientLocality); - // add new tss metrics object to queue Reference metrics = makeReference(); tssMetrics[tssi.id()] = metrics; - // TODO any other requests it makes sense to duplicate? 
- // add each read data request interface to map (getValue, getKey, getKeyValues, watchValue) + // Add each read data request we want to duplicate to TSS to endpoint mapping (getValue, getKey, getKeyValues, + // watchValue) queueModel.updateTssEndpoint( ssInfo->interf.getValue.getEndpoint().token.first(), TSSEndpointData(tssi.id(), tssInfo->interf.getValue.getEndpoint(), metrics, clientInfo->get().id)); @@ -162,10 +154,6 @@ void DatabaseContext::addTssMapping(StorageServerInterface const& ssi, StorageSe queueModel.updateTssEndpoint( ssInfo->interf.watchValue.getEndpoint().token.first(), TSSEndpointData(tssi.id(), tssInfo->interf.watchValue.getEndpoint(), metrics, clientInfo->get().id)); - - // TODO REMOVE - printf( - "added tss endpoints to queue for mapping %s=%s\n", ssi.id().toString().c_str(), tssi.id().toString().c_str()); } Reference StorageServerInfo::getInterface(DatabaseContext* cx, @@ -182,16 +170,11 @@ Reference StorageServerInfo::getInterface(DatabaseContext* cx // changes. it->second->interf = ssi; - - // TODO remove print - printf("maybeAddTss same locality %s\n", ssi.id().toString().c_str()); cx->maybeAddTssMapping(ssi); } else { it->second->notifyContextDestroyed(); Reference loc(new StorageServerInfo(cx, ssi, locality)); cx->server_interf[ssi.id()] = loc.getPtr(); - // TODO REMOVE print - printf("maybeAddTss different locality %s\n", ssi.id().toString().c_str()); cx->maybeAddTssMapping(ssi); return loc; } @@ -202,8 +185,6 @@ Reference StorageServerInfo::getInterface(DatabaseContext* cx Reference loc(new StorageServerInfo(cx, ssi, locality)); cx->server_interf[ssi.id()] = loc.getPtr(); - // TODO REMOVE print - // printf("maybeAddTss new ssi %s\n", ssi.id().toString().c_str()); cx->maybeAddTssMapping(ssi); return loc; } @@ -343,6 +324,13 @@ void delref(DatabaseContext* ptr) { ptr->delref(); } +void traceTSSErrors(const char* name, UID tssId, const std::unordered_map& errorsByCode) { + TraceEvent ev(name, tssId); + for (auto& it : errorsByCode) { + 
ev.detail("E" + std::to_string(it.first), it.second); + } +} + ACTOR Future databaseLogger(DatabaseContext* cx) { state double lastLogged = 0; loop { @@ -389,11 +377,18 @@ ACTOR Future databaseLogger(DatabaseContext* cx) { // TODO could skip this tss if request counter is zero? would potentially complicate elapsed calculation // though if (it.second->mismatches.getIntervalDelta()) { - printf("Found tss %s with %d mismatches!!\n", - it.first.toString().c_str(), - it.second->mismatches.getIntervalDelta()); cx->tssMismatchStream.send(it.first); } + + // do error histograms as separate event + if (it.second->ssErrorsByCode.size()) { + traceTSSErrors("TSS_SSErrors", it.first, it.second->ssErrorsByCode); + } + + if (it.second->tssErrorsByCode.size()) { + traceTSSErrors("TSS_TSSErrors", it.first, it.second->tssErrorsByCode); + } + TraceEvent tssEv("TSSClientMetrics", cx->dbId); tssEv.detail("TSSID", it.first) .detail("Elapsed", (lastLogged == 0) ? 0 : now() - lastLogged) @@ -409,7 +404,7 @@ ACTOR Future databaseLogger(DatabaseContext* cx) { tssEv.detail("MeanTSSGetValueLatency", it.second->TSSgetValueLatency.mean()) .detail("MedianTSSGetValueLatency", it.second->TSSgetValueLatency.median()) .detail("TSSGetValueLatency90", it.second->TSSgetValueLatency.percentile(0.90)) - .detail("TSSGetValueLatencyDiff99", it.second->TSSgetValueLatency.percentile(0.99)); + .detail("TSSGetValueLatency99", it.second->TSSgetValueLatency.percentile(0.99)); tssEv.detail("MeanSSGetKeyLatency", it.second->SSgetKeyLatency.mean()) .detail("MedianSSGetKeyLatency", it.second->SSgetKeyLatency.median()) @@ -419,7 +414,7 @@ ACTOR Future databaseLogger(DatabaseContext* cx) { tssEv.detail("MeanTSSGetKeyLatency", it.second->TSSgetKeyLatency.mean()) .detail("MedianTSSGetKeyLatency", it.second->TSSgetKeyLatency.median()) .detail("TSSGetKeyLatency90", it.second->TSSgetKeyLatency.percentile(0.90)) - .detail("TSSGetKeyLatencyDiff99", it.second->TSSgetKeyLatency.percentile(0.99)); + 
.detail("TSSGetKeyLatency99", it.second->TSSgetKeyLatency.percentile(0.99)); tssEv.detail("MeanSSGetKeyValuesLatency", it.second->SSgetKeyLatency.mean()) .detail("MedianSSGetKeyValuesLatency", it.second->SSgetKeyLatency.median()) @@ -429,7 +424,7 @@ ACTOR Future databaseLogger(DatabaseContext* cx) { tssEv.detail("MeanTSSGetKeyValuesLatency", it.second->TSSgetKeyValuesLatency.mean()) .detail("MedianTSSGetKeyValuesLatency", it.second->TSSgetKeyValuesLatency.median()) .detail("TSSGetKeyValuesLatency90", it.second->TSSgetKeyValuesLatency.percentile(0.90)) - .detail("TSSGetKeyValuesLatencyDiff99", it.second->TSSgetKeyValuesLatency.percentile(0.99)); + .detail("TSSGetKeyValuesLatency99", it.second->TSSgetKeyValuesLatency.percentile(0.99)); it.second->clear(); } @@ -826,13 +821,12 @@ ACTOR static Future monitorTssChange(DatabaseContext* cx) { loop { wait(cx->clientInfo->onChange()); if (cx->clientInfo->get().tssMapping != curTssMapping) { - // TODO maybe re-read this from system keys instead if it changes + // To optimize size of the ClientDBInfo payload, we could eventually change CC to just send a tss change + // id/generation, and have client reread the mapping here if it changed. It's a very minor optimization + // though, and would cause extra read load. 
ClientDBInfo clientInfo = cx->clientInfo->get(); curTssMapping = clientInfo.tssMapping; - // TODO REMOVE print - // printf("gonna do tss stuff with %d tss's\n", curTssMapping.size()); - std::unordered_set seenTssIds; if (curTssMapping.size()) { @@ -840,15 +834,7 @@ ACTOR static Future monitorTssChange(DatabaseContext* cx) { seenTssIds.insert(it.second.id()); if (cx->server_interf.count(it.first)) { - // TODO REMOVE - printf("found new tss mapping %s -> %s\n", - it.first.toString().c_str(), - it.second.id().toString().c_str()); cx->addTssMapping(cx->server_interf[it.first]->interf, it.second); - } else { - // TODO REMOVE case and print - // printf("server %s with tss pair %s not in server_interf, skipping for now\n", - // it.first.toString().c_str(), it.second.id().toString().c_str()); } } } @@ -857,8 +843,6 @@ ACTOR static Future monitorTssChange(DatabaseContext* cx) { if (seenTssIds.count(it->first)) { it++; } else { - // TODO REMOVE - printf("Erasing tss %s from tss_metrics\n", it->first.toString().c_str()); it = cx->tssMetrics.erase(it); } } @@ -883,18 +867,14 @@ ACTOR static Future handleTssMismatches(DatabaseContext* cx) { break; } } - // TODO maybe instead of assert, do a trace event because it's possible that by the time we checked the mismatch - // the tss is gone? 
if (found) { - // TODO add trace event + TraceEvent(SevWarnAlways, "TSS_KillMismatch").detail("TSSID", tssID.toString()); TEST(true); // killing TSS because it got mismatch - printf("KILLING TSS %s (partner=%s) BECAUSE OF TSS MISMATCH\n", - tssID.toString().c_str(), - tssPairID.toString().c_str()); - + // TODO we could write something to the system keyspace and then have DD listen to that keyspace and then DD // do exactly this, so why not just cut out the middle man (or the middle system keys, as it were) tr = makeReference(Database(Reference::addRef(cx))); + state int tries = 0; loop { try { tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); @@ -908,16 +888,20 @@ ACTOR static Future handleTssMismatches(DatabaseContext* cx) { break; } catch (Error& e) { - printf("Kill Mismatch TSS Transaction got error %d\n", e.code()); wait(tr->onError(e)); } + tries++; + if (tries > 10) { + // Give up on trying to kill the tss, it'll get another mismatch or a human will investigate + // eventually + TraceEvent("TSS_KillMismatchGaveUp").detail("TSSID", tssID.toString()); + break; + } } - tr = makeReference(); // clear out txn so that the extra ref gets decref'd and we - // can free cx - + // clear out txn so that the extra DatabaseContext ref gets decref'd and we can free cx + tr = makeReference(); } else { TEST(true); // Not killing TSS with mismatch because it's already gone - printf("Not killing TSS %s because of tss mismatch, must be already removed\n", tssID.toString().c_str()); } } } @@ -1264,14 +1248,16 @@ DatabaseContext::DatabaseContext(Reference( KeyRangeRef(LiteralStringRef("profiling/"), LiteralStringRef("profiling0")) - .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin))); + .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin))); registerSpecialKeySpaceModule( - SpecialKeySpace::MODULE::MANAGEMENT, SpecialKeySpace::IMPLTYPE::READWRITE, + SpecialKeySpace::MODULE::MANAGEMENT, + 
SpecialKeySpace::IMPLTYPE::READWRITE, std::make_unique( KeyRangeRef(LiteralStringRef("maintenance/"), LiteralStringRef("maintenance0")) .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin))); registerSpecialKeySpaceModule( - SpecialKeySpace::MODULE::MANAGEMENT, SpecialKeySpace::IMPLTYPE::READWRITE, + SpecialKeySpace::MODULE::MANAGEMENT, + SpecialKeySpace::IMPLTYPE::READWRITE, std::make_unique( KeyRangeRef(LiteralStringRef("data_distribution/"), LiteralStringRef("data_distribution0")) .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin))); @@ -3364,10 +3350,9 @@ ACTOR Future getRange(Database cx, output.readThroughEnd = readThroughEnd; if (BUGGIFY && limits.hasByteLimit() && output.size() > std::max(1, originalLimits.minRows)) { - printf("Buggify resizing in nativeapi\n"); // Copy instead of resizing because TSS maybe be using output's arena for comparison. This only // happens in simulation so it's fine - Standalone copy; + RangeResult copy; int newSize = deterministicRandom()->randomInt(std::max(1, originalLimits.minRows), output.size()); for (int i = 0; i < newSize; i++) { @@ -4419,8 +4404,6 @@ ACTOR static Future tryCommit(Database cx, choose { when(wait(cx->onProxiesChanged())) { reply.cancel(); - // TODO REMOVE - printf("tryCommit proxies changed ERROR!\n"); throw request_maybe_delivered(); } when(CommitID ci = wait(reply)) { diff --git a/fdbclient/StorageServerInterface.cpp b/fdbclient/StorageServerInterface.cpp index 180d1b814c..fe5ef4aaeb 100644 --- a/fdbclient/StorageServerInterface.cpp +++ b/fdbclient/StorageServerInterface.cpp @@ -37,11 +37,6 @@ bool TSS_doCompare(const GetValueRequest& req, Severity traceSeverity, UID tssId) { if (src.value.present() != tss.value.present() || (src.value.present() && src.value.get() != tss.value.get())) { - printf("GetValue %s @ %lld mismatch: src=%s, tss=%s\n", - req.key.printable().c_str(), - req.version, - src.value.present() ? 
traceChecksumValue(src.value.get()).c_str() : "missing", - tss.value.present() ? traceChecksumValue(tss.value.get()).c_str() : "missing"); TraceEvent(traceSeverity, "TSSMismatchGetValue") .suppressFor(1.0) .detail("TSSID", tssId) @@ -52,8 +47,6 @@ bool TSS_doCompare(const GetValueRequest& req, return false; } - // printf("tss GetValueReply matched! src=%s, tss=%s\n", src.value.present() ? src.value.get().toString().c_str() : - // "missing", tss.value.present() ? tss.value.get().toString().c_str() : "missing"); return true; } @@ -70,8 +63,6 @@ bool TSS_doCompare(const GetKeyRequest& req, // check key selectors that start in a TSS shard and end in a non-TSS shard because the other read queries and the // consistency check will eventually catch a misbehaving storage engine. bool matches = true; - // printf("GetKey %s:<%s:%d @ %lld start:\n", - // req.sel.getKey().toString().c_str(), req.sel.orEqual ? "=" : "", req.sel.offset, req.version); if (src.sel.orEqual == tss.sel.orEqual && src.sel.offset == tss.sel.offset) { // full matching case if (src.sel.offset == 0 && src.sel.orEqual) { @@ -99,23 +90,9 @@ bool TSS_doCompare(const GetKeyRequest& req, // where one response has <=0 with the actual result and the other has <0 with the shard upper boundary. // So whichever one has the actual result should have the lower key. bool tssOffsetLarger = (src.sel.offset == tss.sel.offset) ? tss.sel.orEqual : src.sel.offset < tss.sel.offset; - // printf(" partial comparison: tssLarger=%s, tssOffsetLarger=%s, matches=%s\n", tssKeyLarger ? "T" : "F", - // tssOffsetLarger ? "T": "F", matches ? "T" : "F"); matches = tssKeyLarger != tssOffsetLarger; } if (!matches) { - // TODO REMOVE print - printf("GetKey %s:<%s:%d @ %lld mismatch: src=%s:<%s:%d, tss=%s:<%s:%d\n", - req.sel.getKey().printable().c_str(), - req.sel.orEqual ? "=" : "", - req.sel.offset, - req.version, - src.sel.getKey().printable().c_str(), - src.sel.orEqual ? 
"=" : "", - src.sel.offset, - tss.sel.getKey().printable().c_str(), - tss.sel.orEqual ? "=" : "", - tss.sel.offset); TraceEvent(traceSeverity, "TSSMismatchGetKey") .suppressFor(1.0) .detail("TSSID", tssId) @@ -138,32 +115,16 @@ bool TSS_doCompare(const GetKeyValuesRequest& req, Severity traceSeverity, UID tssId) { if (src.more != tss.more || src.data != tss.data) { - // TODO REMOVE debugging prints - printf("GetKeyValues [%s:<%s:%d - %s:<%s:%d) @ %lld (lim=%d limB=%d) mismatch:\n", - req.begin.getKey().printable().c_str(), - req.begin.orEqual ? "=" : "", - req.begin.offset, - req.end.getKey().printable().c_str(), - req.end.orEqual ? "=" : "", - req.end.offset, - req.version, - req.limit, - req.limitBytes); std::string ssResultsString = format("(%d)%s:\n", src.data.size(), src.more ? "+" : ""); - printf("src= (%d)%s:", src.data.size(), src.more ? "+" : ""); for (auto& it : src.data) { - printf(" %s=%s\n", it.key.printable().c_str(), it.value.printable().c_str()); ssResultsString += "\n" + it.key.printable() + "=" + traceChecksumValue(it.value); } std::string tssResultsString = format("(%d)%s:\n", tss.data.size(), tss.more ? "+" : ""); - printf("tss= (%d)%s:", tss.data.size(), tss.more ? "+" : ""); for (auto& it : tss.data) { - printf(" %s=%s\n", it.key.printable().c_str(), it.value.printable().c_str()); tssResultsString += "\n" + it.key.printable() + "=" + traceChecksumValue(it.value); } - printf("\n"); TraceEvent(traceSeverity, "TSSMismatchGetKeyValues") .suppressFor(1.0) @@ -182,10 +143,6 @@ bool TSS_doCompare(const GetKeyValuesRequest& req, return false; } - /*printf("tss GetKeyValues [%s:<%s:%d - %s:<%s:%d) matched! %d=%d\n", - req.begin.getKey().printable().c_str(), req.begin.orEqual ? "=" : "", req.begin.offset, - req.end.getKey().printable().c_str(), req.end.orEqual ? 
"=" : "", req.end.offset, - src.data.size(), tss.data.size());*/ return true; } @@ -195,7 +152,7 @@ bool TSS_doCompare(const WatchValueRequest& req, const WatchValueReply& tss, Severity traceSeverity, UID tssId) { - // TODO should this check that both returned the same version? We mainly want to duplicate watches just for load + // We duplicate watches just for load, no need to validte replies. return true; } @@ -233,53 +190,9 @@ bool TSS_doCompare(const SplitRangeRequest& req, const SplitRangeReply& tss, Severity traceSeverity, UID tssId) { - // TODO in theory this should return the same response from both right? return true; } -// don't duplicate \xff reads or fetchKeys (avoid adding load to servers) -template <> -bool TSS_shouldDuplicateRequest(const GetValueRequest& req) { - return req.key.size() == 0 || req.key[0] != 0xff; -} - -template <> -bool TSS_shouldDuplicateRequest(const GetKeyRequest& req) { - return req.sel.getKey().size() == 0 || req.sel.getKey()[0] != 0xff; -} - -template <> -bool TSS_shouldDuplicateRequest(const GetKeyValuesRequest& req) { - return (req.begin.getKey().size() == 0 || req.begin.getKey()[0] != 0xff || req.end.getKey().size() == 0 || - req.end.getKey()[0] != 0xff) && - !req.isFetchKeys; -} - -template <> -bool TSS_shouldDuplicateRequest(const WatchValueRequest& req) { - return req.key.size() == 0 || req.key[0] != 0xff; -} - -template <> -bool TSS_shouldDuplicateRequest(const WaitMetricsRequest& req) { - return false; -} - -template <> -bool TSS_shouldDuplicateRequest(const SplitMetricsRequest& req) { - return false; -} - -template <> -bool TSS_shouldDuplicateRequest(const ReadHotSubRangeRequest& req) { - return false; -} - -template <> -bool TSS_shouldDuplicateRequest(const SplitRangeRequest& req) { - return false; -} - // only record metrics for data reads template <> @@ -317,20 +230,26 @@ void TSSMetrics::recordLatency(const SplitRangeRequest& req, double ssLatency, d // ------------------- -// TODO ADD UNIT TESTS for compare 
methods, especially GetKey!! TEST_CASE("/StorageServerInterface/TSSCompare/TestComparison") { printf("testing tss comparisons\n"); + // to avoid compiler issues that StringRef(char* is deprecated) + std::string s_a = "a"; + std::string s_b = "b"; + std::string s_c = "c"; + std::string s_d = "d"; + std::string s_e = "e"; + // test getValue GetValueRequest gvReq; - gvReq.key = StringRef("a"); + gvReq.key = StringRef(s_a); gvReq.version = 5; UID tssId; GetValueReply gvReplyMissing; - GetValueReply gvReplyA(Optional(StringRef("a")), false); - GetValueReply gvReplyB(Optional(StringRef("b")), false); + GetValueReply gvReplyA(Optional(StringRef(s_a)), false); + GetValueReply gvReplyB(Optional(StringRef(s_b)), false); ASSERT(TSS_doCompare(gvReq, gvReplyMissing, gvReplyMissing, SevInfo, tssId)); ASSERT(TSS_doCompare(gvReq, gvReplyA, gvReplyA, SevInfo, tssId)); ASSERT(TSS_doCompare(gvReq, gvReplyB, gvReplyB, SevInfo, tssId)); @@ -341,15 +260,15 @@ TEST_CASE("/StorageServerInterface/TSSCompare/TestComparison") { // test GetKeyValues Arena a; // for all of the refs. ASAN complains if this isn't done. 
Could also make them all standalone i guess GetKeyValuesRequest gkvReq; - gkvReq.begin = firstGreaterOrEqual(StringRef(a, "A")); - gkvReq.end = firstGreaterOrEqual(StringRef(a, "C")); + gkvReq.begin = firstGreaterOrEqual(StringRef(a, s_a)); + gkvReq.end = firstGreaterOrEqual(StringRef(a, s_b)); gkvReq.version = 5; GetKeyValuesReply gkvReplyEmpty; GetKeyValuesReply gkvReplyOne; KeyValueRef v; - v.key = StringRef(a, "a"); - v.value = StringRef(a, "1"); + v.key = StringRef(a, s_a); + v.value = StringRef(a, s_b); gkvReplyOne.data.push_back_deep(gkvReplyOne.arena, v); GetKeyValuesReply gkvReplyOneMore; gkvReplyOneMore.data.push_back_deep(gkvReplyOneMore.arena, v); @@ -363,14 +282,14 @@ TEST_CASE("/StorageServerInterface/TSSCompare/TestComparison") { // test GetKey GetKeyRequest gkReq; - gkReq.sel = KeySelectorRef(StringRef(a, "Z"), false, 1); + gkReq.sel = KeySelectorRef(StringRef(a, s_a), false, 1); gkReq.version = 5; - GetKeyReply gkReplyA(KeySelectorRef(StringRef(a, "A"), false, 20), false); - GetKeyReply gkReplyB(KeySelectorRef(StringRef(a, "B"), false, 10), false); - GetKeyReply gkReplyC(KeySelectorRef(StringRef(a, "C"), true, 0), false); - GetKeyReply gkReplyD(KeySelectorRef(StringRef(a, "D"), false, -10), false); - GetKeyReply gkReplyE(KeySelectorRef(StringRef(a, "E"), false, -20), false); + GetKeyReply gkReplyA(KeySelectorRef(StringRef(a, s_a), false, 20), false); + GetKeyReply gkReplyB(KeySelectorRef(StringRef(a, s_b), false, 10), false); + GetKeyReply gkReplyC(KeySelectorRef(StringRef(a, s_c), true, 0), false); + GetKeyReply gkReplyD(KeySelectorRef(StringRef(a, s_d), false, -10), false); + GetKeyReply gkReplyE(KeySelectorRef(StringRef(a, s_e), false, -20), false); // identical cases ASSERT(TSS_doCompare(gkReq, gkReplyA, gkReplyA, SevInfo, tssId)); @@ -396,26 +315,26 @@ TEST_CASE("/StorageServerInterface/TSSCompare/TestComparison") { // test same offset/orEqual wrong key ASSERT(!TSS_doCompare(gkReq, - GetKeyReply(KeySelectorRef(StringRef(a, "A"), true, 0), 
false), - GetKeyReply(KeySelectorRef(StringRef("B"), true, 0), false), + GetKeyReply(KeySelectorRef(StringRef(a, s_a), true, 0), false), + GetKeyReply(KeySelectorRef(StringRef(a, s_b), true, 0), false), SevInfo, tssId)); // this could be from different shard boundaries, so don't say it's a mismatch ASSERT(TSS_doCompare(gkReq, - GetKeyReply(KeySelectorRef(StringRef(a, "A"), false, 10), false), - GetKeyReply(KeySelectorRef(StringRef(a, "B"), false, 10), false), + GetKeyReply(KeySelectorRef(StringRef(a, s_a), false, 10), false), + GetKeyReply(KeySelectorRef(StringRef(a, s_b), false, 10), false), SevInfo, tssId)); // test offsets and key difference don't match ASSERT(!TSS_doCompare(gkReq, - GetKeyReply(KeySelectorRef(StringRef(a, "A"), false, 0), false), - GetKeyReply(KeySelectorRef(StringRef("B"), false, 10), false), + GetKeyReply(KeySelectorRef(StringRef(a, s_a), false, 0), false), + GetKeyReply(KeySelectorRef(StringRef(a, s_b), false, 10), false), SevInfo, tssId)); ASSERT(!TSS_doCompare(gkReq, - GetKeyReply(KeySelectorRef(StringRef(a, "A"), false, -10), false), - GetKeyReply(KeySelectorRef(StringRef("B"), false, 0), false), + GetKeyReply(KeySelectorRef(StringRef(a, s_a), false, -10), false), + GetKeyReply(KeySelectorRef(StringRef(a, s_b), false, 0), false), SevInfo, tssId)); @@ -423,42 +342,41 @@ TEST_CASE("/StorageServerInterface/TSSCompare/TestComparison") { // positive // one that didn't find is +1 ASSERT(TSS_doCompare(gkReq, - GetKeyReply(KeySelectorRef(StringRef(a, "A"), false, 1), false), - GetKeyReply(KeySelectorRef(StringRef("B"), true, 0), false), + GetKeyReply(KeySelectorRef(StringRef(a, s_a), false, 1), false), + GetKeyReply(KeySelectorRef(StringRef(a, s_b), true, 0), false), SevInfo, tssId)); ASSERT(!TSS_doCompare(gkReq, - GetKeyReply(KeySelectorRef(StringRef(a, "A"), true, 0), false), - GetKeyReply(KeySelectorRef(StringRef("B"), false, 1), false), + GetKeyReply(KeySelectorRef(StringRef(a, s_a), true, 0), false), + GetKeyReply(KeySelectorRef(StringRef(a, 
s_b), false, 1), false), SevInfo, tssId)); // negative will have zero offset but not equal set ASSERT(TSS_doCompare(gkReq, - GetKeyReply(KeySelectorRef(StringRef(a, "A"), true, 0), false), - GetKeyReply(KeySelectorRef(StringRef("B"), false, 0), false), + GetKeyReply(KeySelectorRef(StringRef(a, s_a), true, 0), false), + GetKeyReply(KeySelectorRef(StringRef(a, s_b), false, 0), false), SevInfo, tssId)); ASSERT(!TSS_doCompare(gkReq, - GetKeyReply(KeySelectorRef(StringRef(a, "A"), false, 0), false), - GetKeyReply(KeySelectorRef(StringRef("B"), true, 0), false), + GetKeyReply(KeySelectorRef(StringRef(a, s_a), false, 0), false), + GetKeyReply(KeySelectorRef(StringRef(a, s_b), true, 0), false), SevInfo, tssId)); // test shard boundary key returned by incomplete query is the same as the key found by the other (only possible in // positive direction) ASSERT(TSS_doCompare(gkReq, - GetKeyReply(KeySelectorRef(StringRef(a, "A"), true, 0), false), - GetKeyReply(KeySelectorRef(StringRef("A"), false, 1), false), + GetKeyReply(KeySelectorRef(StringRef(a, s_a), true, 0), false), + GetKeyReply(KeySelectorRef(StringRef(a, s_a), false, 1), false), SevInfo, tssId)); // explictly test checksum function - std::string s = "A"; std::string s12 = "ABCDEFGHIJKL"; std::string s13 = "ABCDEFGHIJKLO"; std::string checksumStart13 = "(13)"; - ASSERT(s == traceChecksumValue(StringRef(s))); + ASSERT(s_a == traceChecksumValue(StringRef(s_a))); ASSERT(s12 == traceChecksumValue(StringRef(s12))); ASSERT(checksumStart13 == traceChecksumValue(StringRef(s13)).substr(0, 4)); return Void(); diff --git a/fdbclient/StorageServerInterface.h b/fdbclient/StorageServerInterface.h index 9a514a447e..be1a223453 100644 --- a/fdbclient/StorageServerInterface.h +++ b/fdbclient/StorageServerInterface.h @@ -56,10 +56,7 @@ struct StorageServerInterface { LocalityData locality; UID uniqueID; - // TODO get rid of explicit mapping? 
- // Effectively implements Optional but serializer didn't like Optional - bool isTss; - UID tssPairID; + Optional tssPairID; RequestStream getValue; RequestStream getKey; @@ -80,12 +77,13 @@ struct StorageServerInterface { RequestStream getReadHotRanges; RequestStream getRangeSplitPoints; - explicit StorageServerInterface(UID uid) : uniqueID(uid), isTss(false) {} - StorageServerInterface() : uniqueID(deterministicRandom()->randomUniqueID()), isTss(false) {} + explicit StorageServerInterface(UID uid) : uniqueID(uid) {} + StorageServerInterface() : uniqueID(deterministicRandom()->randomUniqueID()) {} NetworkAddress address() const { return getValue.getEndpoint().getPrimaryAddress(); } NetworkAddress stableAddress() const { return getValue.getEndpoint().getStableAddress(); } Optional secondaryAddress() const { return getValue.getEndpoint().addresses.secondaryAddress; } UID id() const { return uniqueID; } + bool isTss() const { return tssPairID.present(); } std::string toString() const { return id().shortString(); } template void serialize(Ar& ar) { @@ -95,7 +93,7 @@ struct StorageServerInterface { if (ar.protocolVersion().hasSmallEndpoints()) { if (ar.protocolVersion().hasTSS()) { - serializer(ar, uniqueID, locality, getValue, isTss, tssPairID); + serializer(ar, uniqueID, locality, getValue, tssPairID); } else { serializer(ar, uniqueID, locality, getValue); } diff --git a/fdbclient/SystemData.cpp b/fdbclient/SystemData.cpp index 1d7a750fe5..9ffd58464f 100644 --- a/fdbclient/SystemData.cpp +++ b/fdbclient/SystemData.cpp @@ -556,7 +556,6 @@ StorageServerInterface decodeServerListValue(ValueRef const& value) { return s; } -// TODO merge this with above stuff or something const Value serverListValueFB(StorageServerInterface const& server) { return ObjectWriter::toValue(server, IncludeVersion()); } @@ -1111,8 +1110,8 @@ void testSSISerdes(StorageServerInterface const& ssi, bool useFB) { printf("ssi=\nid=%s\nlocality=%s\nisTss=%s\ntssId=%s\naddress=%s\ngetValue=%s\n\n\n", 
ssi.id().toString().c_str(), ssi.locality.toString().c_str(), - ssi.isTss ? "true" : "false", - ssi.isTss ? ssi.tssPairID.toString().c_str() : "", + ssi.isTss() ? "true" : "false", + ssi.isTss() ? ssi.tssPairID.get().toString().c_str() : "", ssi.address().toString().c_str(), ssi.getValue.getEndpoint().token.toString().c_str()); @@ -1122,16 +1121,16 @@ void testSSISerdes(StorageServerInterface const& ssi, bool useFB) { printf("ssi2=\nid=%s\nlocality=%s\nisTss=%s\ntssId=%s\naddress=%s\ngetValue=%s\n\n\n", ssi2.id().toString().c_str(), ssi2.locality.toString().c_str(), - ssi2.isTss ? "true" : "false", - ssi2.isTss ? ssi2.tssPairID.toString().c_str() : "", + ssi2.isTss() ? "true" : "false", + ssi2.isTss() ? ssi2.tssPairID.get().toString().c_str() : "", ssi2.address().toString().c_str(), ssi2.getValue.getEndpoint().token.toString().c_str()); ASSERT(ssi.id() == ssi2.id()); ASSERT(ssi.locality == ssi2.locality); - ASSERT(ssi.isTss == ssi2.isTss); - if (ssi.isTss) { - ASSERT(ssi2.tssPairID == ssi2.tssPairID); + ASSERT(ssi.isTss() == ssi2.isTss()); + if (ssi.isTss()) { + ASSERT(ssi2.tssPairID.get() == ssi2.tssPairID.get()); } ASSERT(ssi.address() == ssi2.address()); ASSERT(ssi.getValue.getEndpoint().token == ssi2.getValue.getEndpoint().token); @@ -1149,13 +1148,11 @@ TEST_CASE("/SystemData/SerDes/SSI") { StorageServerInterface ssi; ssi.uniqueID = UID(0x1234123412341234, 0x5678567856785678); ssi.locality = localityData; - ssi.isTss = false; ssi.initEndpoints(); testSSISerdes(ssi, false); testSSISerdes(ssi, true); - ssi.isTss = true; ssi.tssPairID = UID(0x2345234523452345, 0x1238123812381238); testSSISerdes(ssi, false); diff --git a/fdbrpc/LoadBalance.actor.h b/fdbrpc/LoadBalance.actor.h index 2f1ee375bf..33a689c31d 100644 --- a/fdbrpc/LoadBalance.actor.h +++ b/fdbrpc/LoadBalance.actor.h @@ -31,9 +31,6 @@ #include "flow/flow.h" #include "flow/Knobs.h" -// TODO REMOVE? 
-#include - #include "fdbrpc/FailureMonitor.h" #include "fdbrpc/fdbrpc.h" #include "fdbrpc/Locality.h" @@ -85,9 +82,8 @@ Future tssComparison(Req req, Future> fSource, Future> fTss, TSSEndpointData tssData) { - // TODO add timeout and time requests state double startTime = now(); - state Future>> fTssWithTimeout = timeout(fTss, 5.0 /*TODO knob?*/); + state Future>> fTssWithTimeout = timeout(fTss, FLOW_KNOBS->LOAD_BALANCE_TSS_TIMEOUT); state int finished = 0; state double srcEndTime; state double tssEndTime; @@ -113,16 +109,21 @@ Future tssComparison(Req req, } } + // we want to record ss/tss errors to metrics + int srcErrorCode = error_code_success; + int tssErrorCode = error_code_success; + ++tssData.metrics->requests; if (src.isError()) { - ++tssData.metrics->ssErrors; + srcErrorCode = src.getError().code(); + tssData.metrics->ssError(srcErrorCode); } if (!tss.present()) { ++tssData.metrics->tssTimeouts; } else if (tss.get().isError()) { - ++tssData.metrics->tssErrors; - printf("Tss got error %d\n", tss.get().getError().code()); + tssErrorCode = tss.get().getError().code(); + tssData.metrics->tssError(tssErrorCode); } if (!src.isError() && tss.present() && !tss.get().isError()) { Optional srcLB = getLoadBalancedReply(&src.get()); @@ -146,13 +147,23 @@ Future tssComparison(Req req, ++tssData.metrics->mismatches; } } else if (tssLB.present() && tssLB.get().error.present()) { - ++tssData.metrics->tssErrors; - printf("Tss got LB error %d\n", tssLB.get().error.get().code()); + tssErrorCode = tssLB.get().error.get().code(); + tssData.metrics->tssError(tssErrorCode); } else if (srcLB.present() && srcLB.get().error.present()) { - ++tssData.metrics->ssErrors; + srcErrorCode = srcLB.get().error.get().code(); + tssData.metrics->ssError(srcErrorCode); } } + if (srcErrorCode != error_code_success && tssErrorCode != error_code_success && srcErrorCode != tssErrorCode) { + // if ss and tss both got different errors, record them + TraceEvent("TSSErrorMismatch") + 
.suppressFor(1.0) + .detail("TSSID", tssData.tssId) + .detail("SSError", srcErrorCode) + .detail("TSSError", tssErrorCode); + } + return Void(); } @@ -172,22 +183,20 @@ struct RequestData : NonCopyable { // This is true once setupRequest is called, even though at that point the response is Never(). bool isValid() { return response.isValid(); } - void maybeDuplicateTSSRequest(RequestStream const* stream, - Request const& request, - QueueModel* model, - Future ssResponse) { + static void maybeDuplicateTSSRequest(RequestStream const* stream, + Request& request, + QueueModel* model, + Future ssResponse) { if (model) { // Send parallel request to TSS pair, if it exists Optional tssData = model->getTssData(stream->getEndpoint().token.first()); - if (tssData.present() && TSS_shouldDuplicateRequest(request)) { + if (tssData.present()) { resetReply(request); - - // TODO add timeout from knob to tss request? // FIXME: optimize to avoid creating new netNotifiedQueue for each message RequestStream tssRequestStream(tssData.get().endpoint); Future> fTssResult = tssRequestStream.tryGetReply(request); - model->addActor.send(tssComparison(request, fResult, fTssResult, tssData.get())); + model->addActor.send(tssComparison(request, ssResponse, fTssResult, tssData.get())); } } } @@ -196,7 +205,7 @@ struct RequestData : NonCopyable { void startRequest(double backoff, bool triedAllOptions, RequestStream const* stream, - Request const& request, + Request& request, QueueModel* model) { modelHolder = Reference(); requestStarted = false; @@ -207,8 +216,8 @@ struct RequestData : NonCopyable { requestStarted = true; modelHolder = Reference(new ModelHolder(model, stream->getEndpoint().token.first())); Future resp = stream->tryGetReply(request); - maybeDuplicateTSSRequest(stream, request, model, resp); - return resp; + maybeDuplicateTSSRequest(stream, request, model, resp); + return resp; }); } else { requestStarted = true; diff --git a/fdbrpc/QueueModel.cpp b/fdbrpc/QueueModel.cpp index 
2cb5687b61..6aaaf3df34 100644 --- a/fdbrpc/QueueModel.cpp +++ b/fdbrpc/QueueModel.cpp @@ -18,8 +18,6 @@ * limitations under the License. */ -#include - #include "fdbrpc/QueueModel.h" #include "fdbrpc/LoadBalance.h" @@ -66,11 +64,10 @@ void QueueModel::updateTssEndpoint(uint64_t endpointId, TSSEndpointData tssData) auto& d = data[endpointId]; if (!d.tssData.present()) { tssCount++; + d.tssData = Optional(tssData); + } else { + d.tssData.get().generation = tssData.generation; } - - d.tssData = Optional(tssData); - // TODO REMOVE print - printf("Setting tss endpoint for %" PRIx64 " = %s\n", endpointId, tssData.endpoint.token.toString().c_str()); } void QueueModel::removeOldTssData(UID currentGeneration) { @@ -78,12 +75,6 @@ void QueueModel::removeOldTssData(UID currentGeneration) { // expire old tss mappings that aren't present in new mapping for (auto& it : data) { if (it.second.tssData.present() && it.second.tssData.get().generation != currentGeneration) { - // TODO REMOVE print - printf("Removing tss endpoint for %" PRIx64 - " because its generation %s doesn't match the current one %s\n", - it.first, - it.second.tssData.get().generation.toString().c_str(), - currentGeneration.toString().c_str()); it.second.tssData = Optional(); tssCount--; } diff --git a/fdbrpc/QueueModel.h b/fdbrpc/QueueModel.h index f8592fa9a5..1e8cd009a0 100644 --- a/fdbrpc/QueueModel.h +++ b/fdbrpc/QueueModel.h @@ -33,8 +33,7 @@ struct TSSEndpointData { UID tssId; Endpoint endpoint; Reference metrics; - UID generation; // TODO this isn't exactly like a generation since it's not ordered, i'll try to think of a better - // name + UID generation; TSSEndpointData(UID tssId, Endpoint endpoint, Reference metrics, UID generation) : tssId(tssId), endpoint(endpoint), metrics(metrics), generation(generation) {} @@ -106,7 +105,10 @@ public: double secondBudget; PromiseStream> addActor; Future laggingRequests; // requests for which a different recipient already answered + PromiseStream> addTSSActor; + 
Future tssComparisons; // requests for which a different recipient already answered int laggingRequestCount; + int laggingTSSCompareCount; void updateTssEndpoint(uint64_t endpointId, TSSEndpointData endpointData); void removeOldTssData(UID currentGeneration); @@ -114,9 +116,13 @@ public: QueueModel() : secondMultiplier(1.0), secondBudget(0), laggingRequestCount(0), tssCount(0) { laggingRequests = actorCollection(addActor.getFuture(), &laggingRequestCount); + tssComparisons = actorCollection(addTSSActor.getFuture(), &laggingTSSCompareCount); } - ~QueueModel() { laggingRequests.cancel(); } + ~QueueModel() { + laggingRequests.cancel(); + tssComparisons.cancel(); + } private: std::unordered_map data; diff --git a/fdbrpc/TSSComparison.h b/fdbrpc/TSSComparison.h index 6724e3dae7..335e8ae68e 100644 --- a/fdbrpc/TSSComparison.h +++ b/fdbrpc/TSSComparison.h @@ -37,7 +37,7 @@ struct TSSMetrics : ReferenceCounted, NonCopyable { Counter tssTimeouts; Counter mismatches; - // TODO we could probably just ignore getKey as it's seldom used? + // We could probably just ignore getKey as it's seldom used? 
ContinuousSample SSgetValueLatency; ContinuousSample SSgetKeyLatency; ContinuousSample SSgetKeyValuesLatency; @@ -46,6 +46,19 @@ struct TSSMetrics : ReferenceCounted, NonCopyable { ContinuousSample TSSgetKeyLatency; ContinuousSample TSSgetKeyValuesLatency; + std::unordered_map ssErrorsByCode; + std::unordered_map tssErrorsByCode; + + void ssError(int code) { + ++ssErrors; + ssErrorsByCode[code]++; + } + + void tssError(int code) { + ++tssErrors; + tssErrorsByCode[code]++; + } + template void recordLatency(const Req& req, double ssLatency, double tssLatency); @@ -57,6 +70,9 @@ struct TSSMetrics : ReferenceCounted, NonCopyable { TSSgetValueLatency.clear(); TSSgetKeyLatency.clear(); TSSgetKeyValuesLatency.clear(); + + tssErrorsByCode.clear(); + ssErrorsByCode.clear(); } TSSMetrics() @@ -65,11 +81,6 @@ struct TSSMetrics : ReferenceCounted, NonCopyable { SSgetKeyValuesLatency(1000), TSSgetValueLatency(1000), TSSgetKeyLatency(1000), TSSgetKeyValuesLatency(1000) {} }; -// global static functions - -template -bool TSS_shouldDuplicateRequest(const Req& req); - // part of the contract of this function is that if there is a mismatch, the implementation needs to record a trace // event with the specified severity and tssId in the event. 
template diff --git a/fdbrpc/fdbrpc.h b/fdbrpc/fdbrpc.h index e15e0126a1..a2a6af5af6 100644 --- a/fdbrpc/fdbrpc.h +++ b/fdbrpc/fdbrpc.h @@ -335,7 +335,6 @@ public: Future disc = makeDependent(IFailureMonitor::failureMonitor()).onDisconnectOrFailure(getEndpoint(taskID)); if (disc.isReady()) { - printf("got disconnect or failure 1 :O\n"); return ErrorOr(request_maybe_delivered()); } Reference peer = @@ -354,7 +353,6 @@ public: Future disc = makeDependent(IFailureMonitor::failureMonitor()).onDisconnectOrFailure(getEndpoint()); if (disc.isReady()) { - printf("got disconnect or failure 2 :O\n"); return ErrorOr(request_maybe_delivered()); } Reference peer = diff --git a/fdbserver/ApplyMetadataMutation.cpp b/fdbserver/ApplyMetadataMutation.cpp index 87044f49b7..7349918f7a 100644 --- a/fdbserver/ApplyMetadataMutation.cpp +++ b/fdbserver/ApplyMetadataMutation.cpp @@ -68,6 +68,12 @@ void applyMetadataMutations(SpanID const& spanContext, // std::map> cacheRangeInfo; std::map cachedRangeInfo; + // Testing Storage Server removal (clearing serverTagKey) needs to read tss server list value to determine it is a + // tss + find partner's tag to send the private mutation. 
Since the removeStorageServer transaction clears both the + // storage list and server tag, we have to enforce ordering, proccessing the server tag first, and postpone the + // server list clear until the end; + std::vector tssServerListToRemove; + for (auto const& m : mutations) { //TraceEvent("MetadataMutation", dbgid).detail("M", m.toString()); if (toCommit) { @@ -95,14 +101,14 @@ void applyMetadataMutations(SpanID const& spanContext, for (const auto& id : src) { auto storageInfo = getStorageInfo(id, storageCache, txnStateStore); - ASSERT(!storageInfo->interf.isTss); + ASSERT(!storageInfo->interf.isTss()); ASSERT(storageInfo->tag != invalidTag); info.tags.push_back(storageInfo->tag); info.src_info.push_back(storageInfo); } for (const auto& id : dest) { auto storageInfo = getStorageInfo(id, storageCache, txnStateStore); - ASSERT(!storageInfo->interf.isTss); + ASSERT(!storageInfo->interf.isTss()); ASSERT(storageInfo->tag != invalidTag); info.tags.push_back(storageInfo->tag); info.dest_info.push_back(storageInfo); @@ -115,11 +121,8 @@ void applyMetadataMutations(SpanID const& spanContext, txnStateStore->set(KeyValueRef(m.param1, m.param2)); } else if (m.param1.startsWith(serverKeysPrefix)) { if (toCommit) { - Optional t = - txnStateStore->readValue(serverTagKeyFor(serverKeysDecodeServer(m.param1))).get(); - // printf("got SetValue for serverKeysPrefix/%s, tag=%s\n", - // serverKeysDecodeServer(m.param1).toString().c_str(), t.present() ? 
- // decodeServerTagValue(t.get()).toString().c_str() : ""); + Tag tag = decodeServerTagValue( + txnStateStore->readValue(serverTagKeyFor(serverKeysDecodeServer(m.param1))).get().get()); MutationRef privatized = m; privatized.param1 = m.param1.withPrefix(systemKeys.begin, arena); TraceEvent(SevDebug, "SendingPrivateMutation", dbgid) @@ -127,14 +130,9 @@ void applyMetadataMutations(SpanID const& spanContext, .detail("Privatized", privatized.toString()) .detail("Server", serverKeysDecodeServer(m.param1)) .detail("TagKey", serverTagKeyFor(serverKeysDecodeServer(m.param1))) - .detail( - "Tag", - decodeServerTagValue( - txnStateStore->readValue(serverTagKeyFor(serverKeysDecodeServer(m.param1))).get().get()) - .toString()); + .detail("Tag", tag.toString()); - toCommit->addTag(decodeServerTagValue( - txnStateStore->readValue(serverTagKeyFor(serverKeysDecodeServer(m.param1))).get().get())); + toCommit->addTag(tag); toCommit->writeTypedMessage(privatized); } } else if (m.param1.startsWith(serverTagPrefix)) { @@ -386,8 +384,20 @@ void applyMetadataMutations(SpanID const& spanContext, } } if (serverListKeys.intersects(range)) { - if (!initialCommit) - txnStateStore->clear(range & serverListKeys); + if (!initialCommit) { + KeyRangeRef rangeToClear = range & serverListKeys; + if (rangeToClear.singleKeyRange()) { + UID id = decodeServerListKey(rangeToClear.begin); + Optional ssiV = txnStateStore->readValue(serverListKeyFor(id)).get(); + if (ssiV.present() && decodeServerListValue(ssiV.get()).isTss()) { + tssServerListToRemove.push_back(rangeToClear); + } else { + txnStateStore->clear(rangeToClear); + } + } else { + txnStateStore->clear(rangeToClear); + } + } } if (tagLocalityListKeys.intersects(range)) { if (!initialCommit) @@ -418,6 +428,32 @@ void applyMetadataMutations(SpanID const& spanContext, toCommit->writeTypedMessage(privatized); } } + // Might be a tss removal, which doesn't store a tag there. 
+ // Chained if is a little verbose, but avoids unecessary work + if (!initialCommit && !serverKeysCleared.size()) { + KeyRangeRef maybeTssRange = range & serverTagKeys; + if (maybeTssRange.singleKeyRange()) { + UID id = decodeServerTagKey(maybeTssRange.begin); + Optional ssiV = txnStateStore->readValue(serverListKeyFor(id)).get(); + + if (ssiV.present()) { + StorageServerInterface ssi = decodeServerListValue(ssiV.get()); + if (ssi.isTss()) { + Optional tagV = + txnStateStore->readValue(serverTagKeyFor(ssi.tssPairID.get())).get(); + if (tagV.present()) { + MutationRef privatized = m; + privatized.param1 = maybeTssRange.begin.withPrefix(systemKeys.begin, arena); + privatized.param2 = + keyAfter(maybeTssRange.begin, arena).withPrefix(systemKeys.begin, arena); + + toCommit->addTag(decodeServerTagValue(tagV.get())); + toCommit->writeTypedMessage(privatized); + } + } + } + } + } } if (!initialCommit) { KeyRangeRef clearRange = range & serverTagKeys; @@ -575,6 +611,10 @@ void applyMetadataMutations(SpanID const& spanContext, } } + for (KeyRangeRef& range : tssServerListToRemove) { + txnStateStore->clear(range); + } + // If we accumulated private mutations for cached key-ranges, we also need to // tag them with the relevant storage servers. 
This is done to make the storage // servers aware of the cached key-ranges diff --git a/fdbserver/ClusterController.actor.cpp b/fdbserver/ClusterController.actor.cpp index abb87fdf2d..d6a2482950 100644 --- a/fdbserver/ClusterController.actor.cpp +++ b/fdbserver/ClusterController.actor.cpp @@ -1,4 +1,3 @@ - /* * ClusterController.actor.cpp * @@ -3186,9 +3185,9 @@ ACTOR Future workerAvailabilityWatch(WorkerInterface worker, checkOutstandingRequests(cluster); } } - when(wait(failed)) { // remove workers that have failed WorkerInfo& failedWorkerInfo = cluster->id_worker[worker.locality.processId()]; + if (!failedWorkerInfo.reply.isSet()) { failedWorkerInfo.reply.send( RegisterWorkerReply(failedWorkerInfo.details.processClass, failedWorkerInfo.priorityInfo)); @@ -3379,12 +3378,6 @@ void clusterRegisterMaster(ClusterControllerData* self, RegisterMasterRequest co isChanged = true; } - // TODO remove debugging - printf("CC:\ntss_count=%d\ntss_storage_engine=%d|%s\n", - db->config.desiredTSSCount, - db->config.testingStorageServerStoreType, - db->config.testingStorageServerStoreType.toString().c_str()); - // Construct the client information if (db->clientInfo->get().commitProxies != req.commitProxies || db->clientInfo->get().grvProxies != req.grvProxies) { @@ -3894,12 +3887,11 @@ ACTOR Future monitorTSSMapping(ClusterControllerData* self) { tssIdMap[it.first] = it.second; // ensure two storage servers don't map to same TSS ASSERT(seenTssIds.insert(it.second).second); + // ensure a storage server doesn't accidentally map to itself (unless we're in HACK_IDENTITY_MAPPING + // mode) + ASSERT(SERVER_KNOBS->TSS_HACK_IDENTITY_MAPPING || it.first != it.second); } - // TODO REMOVE print - printf("tss mapping of size %d\n", tssIdMap.size()); - - // TODO is copying storage server interfaces bad? 
state std::vector> newMapping; state std::map oldMapping; state bool mappingChanged = false; @@ -3909,10 +3901,6 @@ ACTOR Future monitorTSSMapping(ClusterControllerData* self) { for (auto& it : clientInfo.tssMapping) { oldMapping[it.first] = it.second; if (!tssIdMap.count(it.first)) { - // TODO add trace event - printf("tss mapping removed: %s=%s\n", - it.first.toString().c_str(), - it.second.id().toString().c_str()); TraceEvent("TSS_MappingRemoved", self->id) .detail("SSID", it.first) .detail("TSSID", it.second.id()); @@ -3936,22 +3924,16 @@ ACTOR Future monitorTSSMapping(ClusterControllerData* self) { .detail("SSID", it.first) .detail("TSSID", it.second) .detail("OldTSSID", interf.id()); - printf("tss mapping updated: %s=%s\n", - it.first.toString().c_str(), - it.second.toString().c_str()); mappingChanged = true; } } else { - // TODO add trace event TraceEvent("TSS_MappingAdded", self->id).detail("SSID", it.first).detail("TSSID", it.second); - printf("tss mapping added: %s=%s\n", it.first.toString().c_str(), it.second.toString().c_str()); mappingChanged = true; } state UID ssid = it.first; state UID tssid = it.second; // request storage server interface for tssid, add it to results - // TODO could issue all of these futures and then process then after as an optimization Optional tssiVal = wait(tr->get(serverListKeyFor(it.second))); // because we read the tss mapping in the same transaction, there can be no races with tss removal @@ -3961,11 +3943,6 @@ ACTOR Future monitorTSSMapping(ClusterControllerData* self) { StorageServerInterface tssi = decodeServerListValue(tssiVal.get()); if (oldTssId.present() && tssi.id() == oldTssId.get() && oldGetValueEndpoint.present() && oldGetValueEndpoint.get() != tssi.getValue.getEndpoint().token) { - // TODO REMOVE print - printf("tss %s restarted, getValue %s -> %s\n", - tssi.id().toString().c_str(), - oldGetValueEndpoint.get().toString().c_str(), - tssi.getValue.getEndpoint().token.toString().c_str()); mappingChanged = true; } 
newMapping.push_back(std::pair(ssid, tssi)); @@ -3973,8 +3950,6 @@ ACTOR Future monitorTSSMapping(ClusterControllerData* self) { // if nothing changed, skip updating if (mappingChanged) { - // TODO REMOVE print - printf("CC updating tss client and server info\n"); clientInfo.id = deterministicRandom()->randomUniqueID(); clientInfo.tssMapping = newMapping; self->db.clientInfo->set(clientInfo); @@ -4592,7 +4567,6 @@ ACTOR Future clusterControllerCore(ClusterControllerFullInterface interf, when(GetWorkersRequest req = waitNext(interf.getWorkers.getFuture())) { ++self.getWorkersRequests; vector workers; - // printf("CC got GetWorkersRequest\n"); for (auto& it : self.id_worker) { if ((req.flags & GetWorkersRequest::NON_EXCLUDED_PROCESSES_ONLY) && diff --git a/fdbserver/CommitProxyServer.actor.cpp b/fdbserver/CommitProxyServer.actor.cpp index 3fc1ed02c3..d1469c0d3b 100644 --- a/fdbserver/CommitProxyServer.actor.cpp +++ b/fdbserver/CommitProxyServer.actor.cpp @@ -1507,7 +1507,6 @@ ACTOR static Future rejoinServer(CommitProxyInterface proxy, ProxyCommitDa loop { GetStorageServerRejoinInfoRequest req = waitNext(proxy.getStorageServerRejoinInfo.getFuture()); - printf("Proxy got Rejoin req for %s\n", req.id.toString().c_str()); if (commitData->txnStateStore->readValue(serverListKeyFor(req.id)).get().present()) { GetStorageServerRejoinInfoReply rep; rep.version = commitData->version; @@ -1568,10 +1567,8 @@ ACTOR static Future rejoinServer(CommitProxyInterface proxy, ProxyCommitDa } rep.newTag = Tag(maxTagLocality + 1, 0); } - printf("Proxy sent Rejoin response for %s\n", req.id.toString().c_str()); req.reply.send(rep); } else { - printf("Proxy notifying %s it can't rejoin because it was removed.\n", req.id.toString().c_str()); req.reply.sendError(worker_removed()); } } diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index cbb0364178..659ce2cbd0 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp 
@@ -67,6 +67,7 @@ struct TCServerInfo : public ReferenceCounted { Promise removed; Future onRemoved; Future onTSSPairRemoved; + Promise killTss; Promise wakeUpTracker; bool inDesiredDC; LocalityEntry localityEntry; @@ -85,7 +86,7 @@ struct TCServerInfo : public ReferenceCounted { : id(ssi.id()), collection(collection), lastKnownInterface(ssi), lastKnownClass(processClass), dataInFlightToServer(0), onInterfaceChanged(interfaceChanged.getFuture()), onRemoved(removed.getFuture()), inDesiredDC(inDesiredDC), storeType(KeyValueStoreType::END), onTSSPairRemoved(Never()) { - if (!ssi.isTss) { + if (!ssi.isTss()) { localityEntry = ((LocalityMap*)storageServerSet.getPtr())->add(ssi.locality, &id); } } @@ -451,19 +452,16 @@ ACTOR Future> getInitialDataDistribution(Data for (int i = 0; i < serverList.get().size(); i++) { auto ssi = decodeServerListValue(serverList.get()[i].value); - if (!ssi.isTss) { - printf("DD adding SS %s on init\n", ssi.id().toString().c_str()); + if (!ssi.isTss()) { result->allServers.push_back(std::make_pair(ssi, id_data[ssi.locality.processId()].processClass)); server_dc[ssi.id()] = ssi.locality.dcId(); } else { - printf("DD ignoring TSS %s on init until after team building\n", ssi.id().toString().c_str()); tss_servers.push_back(std::make_pair(ssi, id_data[ssi.locality.processId()].processClass)); } } break; } catch (Error& e) { - printf("get initial DD failed %d\n", e.code()); wait(tr.onError(e)); ASSERT(!succeeded); // We shouldn't be retrying if we have already started modifying result in this loop @@ -557,7 +555,6 @@ ACTOR Future> getInitialDataDistribution(Data beginKey = keyServers.end()[-1].key; break; } catch (Error& e) { - printf("GetInitialTeams got error %d\n", e.code()); TraceEvent("GetInitialTeamsKeyServersRetry", distributorId).error(e); wait(tr.onError(e)); @@ -573,7 +570,6 @@ ACTOR Future> getInitialDataDistribution(Data // add tss to server list AFTER teams are built for (auto& it : tss_servers) { - printf("DD adding TSS %s on init\n", 
it.first.id().toString().c_str()); result->allServers.push_back(it); } @@ -1167,7 +1163,6 @@ struct DDTeamCollection : ReferenceCounted { self->addActor.send(self->checkInvalidLocalities); } } - printf("%p init adding %s\n", (void*)self, i->first.toString().c_str()); self->addServer(i->first, i->second, self->serverTrackerErrorOut, 0, ddEnabledState); } } @@ -2447,23 +2442,16 @@ struct DDTeamCollection : ReferenceCounted { return; } - // printf("addServer(%s)\n", newServer.id().toString().c_str()); - - if (!newServer.isTss) { + if (!newServer.isTss()) { allServers.push_back(newServer.id()); } - TraceEvent(newServer.isTss ? "AddedTSS" : "AddedStorageServer", distributorId) + TraceEvent(newServer.isTss() ? "AddedTSS" : "AddedStorageServer", distributorId) .detail("ServerID", newServer.id()) .detail("ProcessClass", processClass.toString()) .detail("WaitFailureToken", newServer.waitFailure.getEndpoint().token) .detail("Address", newServer.waitFailure.getEndpoint().getPrimaryAddress()); - // TODO how to do this? 
- /*if (newServer.isTss) { - tr.detail("TSSPairID", newServer.tssPairID); - }*/ - auto& r = server_and_tss_info[newServer.id()] = makeReference( newServer, this, @@ -2472,11 +2460,11 @@ struct DDTeamCollection : ReferenceCounted { std::find(includedDCs.begin(), includedDCs.end(), newServer.locality.dcId()) != includedDCs.end(), storageServerSet); - if (newServer.isTss) { - tss_info_by_pair[newServer.tssPairID] = r; + if (newServer.isTss()) { + tss_info_by_pair[newServer.tssPairID.get()] = r; - if (server_info.count(newServer.tssPairID)) { - r->onTSSPairRemoved = server_info[newServer.tssPairID]->onRemoved; + if (server_info.count(newServer.tssPairID.get())) { + r->onTSSPairRemoved = server_info[newServer.tssPairID.get()]->onRemoved; } } else { server_info[newServer.id()] = r; @@ -2485,9 +2473,9 @@ struct DDTeamCollection : ReferenceCounted { } r->tracker = - storageServerTracker(this, cx, r.getPtr(), errorOut, addedVersion, ddEnabledState, newServer.isTss); + storageServerTracker(this, cx, r.getPtr(), errorOut, addedVersion, ddEnabledState, newServer.isTss()); - if (!newServer.isTss) { + if (!newServer.isTss()) { // link and wake up tss' tracker so it knows when this server gets removed if (tss_info_by_pair.count(newServer.id())) { tss_info_by_pair[newServer.id()]->onTSSPairRemoved = r->onRemoved; @@ -2666,19 +2654,16 @@ struct DDTeamCollection : ReferenceCounted { void removeTSS(UID removedServer) { // much simpler than remove server. 
tss isn't in any teams, so just remove it from data structures - TEST(true); // Remove a TSS frm the cluster - printf("Removing tss %s\n", removedServer.toString().c_str()); TraceEvent("RemovedTSS", distributorId).detail("ServerID", removedServer); Reference removedServerInfo = server_and_tss_info[removedServer]; - tss_info_by_pair.erase(removedServerInfo->lastKnownInterface.tssPairID); + tss_info_by_pair.erase(removedServerInfo->lastKnownInterface.tssPairID.get()); server_and_tss_info.erase(removedServer); server_status.clear(removedServer); } void removeServer(UID removedServer) { - printf("Removing ss %s\n", removedServer.toString().c_str()); TraceEvent("RemovedStorageServer", distributorId).detail("ServerID", removedServer); // ASSERT( !shardsAffectedByTeamFailure->getServersForTeam( t ) for all t in teams that contain removedServer ) @@ -2800,7 +2785,7 @@ struct DDTeamCollection : ReferenceCounted { }; TCServerInfo::~TCServerInfo() { - if (collection && ssVersionTooFarBehind.get() && !lastKnownInterface.isTss) { + if (collection && ssVersionTooFarBehind.get() && !lastKnownInterface.isTss()) { collection->removeLaggingStorageServer(lastKnownInterface.locality.zoneId().get()); } } @@ -3476,19 +3461,18 @@ ACTOR Future teamTracker(DDTeamCollection* self, Reference tea bool recheck = !healthy && (lastReady != self->initialFailureReactionDelay.isReady() || (lastZeroHealthy && !self->zeroHealthyTeams->get()) || containsFailed); - // TODO recomment - TraceEvent("TeamHealthChangeDetected", self->distributorId) - .detail("Team", team->getDesc()) - .detail("ServersLeft", serversLeft) - .detail("LastServersLeft", lastServersLeft) - .detail("AnyUndesired", anyUndesired) - .detail("LastAnyUndesired", lastAnyUndesired) - .detail("AnyWrongConfiguration", anyWrongConfiguration) - .detail("LastWrongConfiguration", lastWrongConfiguration) - .detail("Recheck", recheck) - .detail("BadTeam", badTeam) - .detail("LastZeroHealthy", lastZeroHealthy) - .detail("ZeroHealthyTeam", 
self->zeroHealthyTeams->get()); + // TraceEvent("TeamHealthChangeDetected", self->distributorId) + // .detail("Team", team->getDesc()) + // .detail("ServersLeft", serversLeft) + // .detail("LastServersLeft", lastServersLeft) + // .detail("AnyUndesired", anyUndesired) + // .detail("LastAnyUndesired", lastAnyUndesired) + // .detail("AnyWrongConfiguration", anyWrongConfiguration) + // .detail("LastWrongConfiguration", lastWrongConfiguration) + // .detail("Recheck", recheck) + // .detail("BadTeam", badTeam) + // .detail("LastZeroHealthy", lastZeroHealthy) + // .detail("ZeroHealthyTeam", self->zeroHealthyTeams->get()); lastReady = self->initialFailureReactionDelay.isReady(); lastZeroHealthy = self->zeroHealthyTeams->get(); @@ -3860,7 +3844,7 @@ ACTOR Future waitServerListChange(DDTeamCollection* self, self->serverTrackerErrorOut, tr.getReadVersion().get(), ddEnabledState); - if (!ssi.isTss) { + if (!ssi.isTss()) { self->doBuildTeams = true; } } @@ -3877,7 +3861,6 @@ ACTOR Future waitServerListChange(DDTeamCollection* self, } } } catch (Error& e) { - printf("WaitServerListChange got error %d\n", e.code()); wait(tr.onError(e)); serverListAndProcessClasses = Never(); isFetchingResults = false; @@ -3967,7 +3950,6 @@ ACTOR Future keyValueStoreTypeTracker(DDTeamCollection* self, TCServerInfo ACTOR Future waitForAllDataRemoved(Database cx, UID serverID, Version addedVersion, DDTeamCollection* teams) { state Reference tr = makeReference(cx); - printf("Waiting for data to be removed from %s\n", serverID.toString().c_str()); loop { try { tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); @@ -4005,10 +3987,6 @@ ACTOR Future storageServerFailureTracker(DDTeamCollection* self, state StorageServerInterface interf = server->lastKnownInterface; state int targetTeamNumPerServer = (SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER * (self->configuration.storageTeamSize + 1)) / 2; - - printf("Starting failure tracker for %sSS %s\n", - server->lastKnownInterface.isTss ? 
"T" : "", - server->lastKnownInterface.id().toString().c_str()); loop { state bool inHealthyZone = false; // healthChanged actor will be Never() if this flag is true if (self->healthyZone.get().present()) { @@ -4027,7 +4005,7 @@ ACTOR Future storageServerFailureTracker(DDTeamCollection* self, } } - if (!interf.isTss) { + if (!interf.isTss()) { if (self->server_status.get(interf.id()).initialized) { bool unhealthy = self->server_status.get(interf.id()).isUnhealthy(); if (unhealthy && !status->isUnhealthy()) { @@ -4059,7 +4037,7 @@ ACTOR Future storageServerFailureTracker(DDTeamCollection* self, choose { when(wait(healthChanged)) { status->isFailed = !status->isFailed; - if (!status->isFailed && !server->lastKnownInterface.isTss && + if (!status->isFailed && !server->lastKnownInterface.isTss() && (server->teams.size() < targetTeamNumPerServer || self->lastBuildTeamsFailed)) { self->doBuildTeams = true; } @@ -4119,9 +4097,6 @@ ACTOR Future storageServerTracker( state int targetTeamNumPerServer = (SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER * (self->configuration.storageTeamSize + 1)) / 2; - // TODO REMOVE - printf("Started %sSS tracker for %s\n", isTss ? "T" : "", server->id.toString().c_str()); - try { loop { status.isUndesired = !self->disableFailingLaggingServers.get() && server->ssVersionTooFarBehind.get(); @@ -4267,9 +4242,7 @@ ACTOR Future storageServerTracker( state bool recordTeamCollectionInfo = false; choose { - when(wait(failureTracker || server->onTSSPairRemoved)) { - printf("Server %s getting removed\n", server->id.toString().c_str()); - + when(wait(failureTracker || server->onTSSPairRemoved || server->killTss.getFuture())) { // The server is failed AND all data has been removed from it, so permanently remove it. TraceEvent("StatusMapChange", self->distributorId) .detail("ServerID", server->id) @@ -4280,9 +4253,8 @@ ACTOR Future storageServerTracker( } // Remove server from FF/serverList - Optional tssPairID = - server->lastKnownInterface.isTss ? 
server->lastKnownInterface.tssPairID : Optional(); - wait(removeStorageServer(cx, server->id, tssPairID, self->lock, ddEnabledState)); + wait(removeStorageServer( + cx, server->id, server->lastKnownInterface.tssPairID, self->lock, ddEnabledState)); TraceEvent("StatusMapChange", self->distributorId) .detail("ServerID", server->id) @@ -4556,7 +4528,6 @@ ACTOR Future checkAndRemoveInvalidLocalityAddr(DDTeamCollection* self) { } int numExistingSSOnAddr(DDTeamCollection* self, const AddressExclusion& addr) { - // TODO add tss? int numExistingSS = 0; for (auto& server : self->server_and_tss_info) { const NetworkAddress& netAddr = server.second->lastKnownInterface.stableAddress(); @@ -4570,16 +4541,24 @@ int numExistingSSOnAddr(DDTeamCollection* self, const AddressExclusion& addr) { } // All state that represents an ongoing tss pair recruitment -struct TSSRecruitmentState : ReferenceCounted, NonCopyable { +struct TSSPairState : ReferenceCounted, NonCopyable { Promise>> ssPairInfo; // if set, for ss to pass its id to tss pair once it is successfully recruited Promise tssPairDone; // if set, for tss to pass ss that it was successfully recruited + Optional dcId; // dc + Optional dataHallId; // data hall + bool active; - TSSRecruitmentState() : active(false) {} + TSSPairState() : active(false) {} - TSSRecruitmentState(Optional dcId) : active(true), dcId(dcId) {} + TSSPairState(const LocalityData& locality) + : active(true), dcId(locality.dcId()), dataHallId(locality.dataHallId()) {} + + bool inDataZone(const LocalityData& locality) { + return locality.dcId() == dcId && locality.dataHallId() == dataHallId; + } void cancel() { // only cancel if both haven't been set, otherwise one half of pair could think it was successful but the other @@ -4603,7 +4582,6 @@ struct TSSRecruitmentState : ReferenceCounted, NonCopyable bool tssRecruitFailed() { if (active && tssPairDone.canBeSet()) { - printf("tssPair: %p\n", &tssPairDone); tssPairDone.send(false); return true; } @@ -4631,13 
+4609,11 @@ struct TSSRecruitmentState : ReferenceCounted, NonCopyable Future waitOnTSS() { return tssPairDone.getFuture(); } }; -// TODO switch recruitment order(ish) - grab tss but don't init it, wait for it to actually grab an ss, then the ss -// signals here to start, then when done this signals the ss to add server ACTOR Future initializeStorage(DDTeamCollection* self, RecruitStorageReply candidateWorker, const DDEnabledState* ddEnabledState, bool recruitTss, - Reference tssState) { + Reference tssState) { // SOMEDAY: Cluster controller waits for availability, retry quickly if a server's Locality changes self->recruitingStream.set(self->recruitingStream.get() + 1); @@ -4656,12 +4632,6 @@ ACTOR Future initializeStorage(DDTeamCollection* self, isr.seedTag = invalidTag; isr.reqId = deterministicRandom()->randomUniqueID(); isr.interfaceId = interfaceId; - isr.isTss = recruitTss; - - printf("InitStorage %s on %sSS %s\n", - interfaceId.toString().c_str(), - recruitTss ? "T" : "", - candidateWorker.worker.address().toString().c_str()); self->recruitingIds.insert(interfaceId); self->recruitingLocalities.insert(candidateWorker.worker.stableAddress()); @@ -4675,29 +4645,21 @@ ACTOR Future initializeStorage(DDTeamCollection* self, .detail("Addr", candidateWorker.worker.address()) .detail("Locality", candidateWorker.worker.locality.toString()); - printf("TSS %s waiting for partner uid\n", interfaceId.toString().c_str()); Optional> ssPairInfoResult = wait(tssState->waitOnSS()); if (ssPairInfoResult.present()) { - printf("TSS %s got pair of %s @ %lld\n", - interfaceId.toString().c_str(), - ssPairInfoResult.get().first.toString().c_str(), - ssPairInfoResult.get().second); - isr.tssPairID = ssPairInfoResult.get().first; - isr.tssPairVersion = ssPairInfoResult.get().second; + isr.tssPairIDAndVersion = ssPairInfoResult.get(); TraceEvent("TSS_Recruit", self->distributorId) - .detail("SSID", isr.tssPairID) + .detail("SSID", ssPairInfoResult.get().first) .detail("TSSID", 
interfaceId) .detail("Stage", "TSSWaitingPair") .detail("Addr", candidateWorker.worker.address()) + .detail("Version", ssPairInfoResult.get().second) .detail("Locality", candidateWorker.worker.locality.toString()); } else { - printf("TSS %s didn't get partner, partner recruitment must have failed, abandoning\n", - interfaceId.toString().c_str()); - isr.isTss = false; doRecruit = false; - TraceEvent(SevWarn, "TSS_RecruitError", self->distributorId) + TraceEvent(SevWarnAlways, "TSS_RecruitError", self->distributorId) .detail("TSSID", interfaceId) .detail("Reason", "SS recruitment failed for some reason") .detail("Addr", candidateWorker.worker.address()) @@ -4731,10 +4693,7 @@ ACTOR Future initializeStorage(DDTeamCollection* self, if (!recruitTss && newServer.present() && tssState->ssRecruitSuccess(std::pair(interfaceId, newServer.get().addedVersion))) { - printf("ss %s signalling tss pair with version %lld\n", - interfaceId.toString().c_str(), - newServer.get().addedVersion); - // ss has a tss pair. send it this id, but wait for add server until tss is recruited + // SS has a tss pair. send it this id, but try to wait for add server until tss is recruited TraceEvent("TSS_Recruit", self->distributorId) .detail("SSID", interfaceId) @@ -4742,10 +4701,9 @@ ACTOR Future initializeStorage(DDTeamCollection* self, .detail("Addr", candidateWorker.worker.address()) .detail("Locality", candidateWorker.worker.locality.toString()); - // wait for timeout, and give up if no TSS pair recruited + // wait for timeout, but eventually move on if no TSS pair recruited Optional tssSuccessful = wait(timeout(tssState->waitOnTSS(), SERVER_KNOBS->TSS_RECRUITMENT_TIMEOUT)); - // TODO if unsuccessful, fail out tss so it doesn't cause a mismatch error? 
if (tssSuccessful.present() && tssSuccessful.get()) { TraceEvent("TSS_Recruit", self->distributorId) .detail("SSID", interfaceId) @@ -4760,14 +4718,7 @@ ACTOR Future initializeStorage(DDTeamCollection* self, : "TSS recruitment timed out") .detail("Addr", candidateWorker.worker.address()) .detail("Locality", candidateWorker.worker.locality.toString()); - - // TODO need to remove that tss here!! } - - // TODO trace event, change sev and message if timeout or if unsuccessful - printf("ss %s %ssuccessfully got tss pair!\n", - interfaceId.toString().c_str(), - (tssSuccessful.present() && tssSuccessful.get()) ? "" : "un"); } self->recruitingIds.erase(interfaceId); @@ -4791,9 +4742,6 @@ ACTOR Future initializeStorage(DDTeamCollection* self, self->serverTrackerErrorOut, newServer.get().addedVersion, ddEnabledState); - } else { - // TODO tss recruitment was cancelled since it failed to send a response to the ss, kill it - printf("TSS recruitment was cancelled, stop\n"); } } else { TraceEvent(SevWarn, "DDRecruitmentError") @@ -4806,15 +4754,12 @@ ACTOR Future initializeStorage(DDTeamCollection* self, } } + // SS and/or TSS recruitment failed at this point, update tssState if (recruitTss && tssState->tssRecruitFailed()) { TEST(true); // TSS recruitment failed for some reason - // if tss wasn't already marked as done, it was unsuccessful in recruitment - printf("tss recruitment failed for some reason, signalling ss.\n"); } if (!recruitTss && tssState->ssRecruitFailed()) { TEST(true); // SS with pair TSS recruitment failed for some reason - // if ss didn't already send its pair id to tss, it was unsuccessful in recruitment - printf("ss recruitment failed for some reason, signalling tss.\n"); } self->recruitingStream.set(self->recruitingStream.get() - 1); @@ -4832,13 +4777,11 @@ ACTOR Future storageRecruiter(DDTeamCollection* self, state std::map numSSPerAddr; // tss-specific recruitment state - state uint32_t tssToRecruit = self->configuration.desiredTSSCount - 
db->get().client.tssMapping.size(); - state Reference tssState = makeReference(); + state int32_t tssToRecruit = self->configuration.desiredTSSCount - db->get().client.tssMapping.size(); + state Reference tssState = makeReference(); + state Future checkKillTss = self->initialFailureReactionDelay; + state bool sleepingAfterKillTss = false; - printf("DD setting tssToRecruit=%d (%d - %d)\n", - tssToRecruit, - self->configuration.desiredTSSCount, - db->get().client.tssMapping.size()); TraceEvent(SevDebug, "TSS_RecruitUpdated", self->distributorId).detail("Count", tssToRecruit); loop { @@ -4924,87 +4867,97 @@ ACTOR Future storageRecruiter(DDTeamCollection* self, .detail("Locality", candidateWorker.worker.locality.toString()); TEST(true); // Starting TSS recruitment - printf("starting recruitment of tss\n"); self->isTssRecruiting = true; - tssState = makeReference(candidateWorker.worker.locality.dcId()); + tssState = makeReference(candidateWorker.worker.locality); self->addActor.send(initializeStorage(self, candidateWorker, ddEnabledState, true, tssState)); } else { - if (tssState->active && candidateWorker.worker.locality.dcId() == tssState->dcId) { - TEST(true); // TSS recruits pair in same dc + if (tssState->active && tssState->inDataZone(candidateWorker.worker.locality)) { + TEST(true); // TSS recruits pair in same dc/datahall self->isTssRecruiting = false; TraceEvent("TSS_Recruit", self->distributorId) .detail("Stage", "PairSS") .detail("Addr", candidateSSAddr.toString()) .detail("Locality", candidateWorker.worker.locality.toString()); - printf("starting recruitment of ss with eventual tss pair in dc \'%s\'\n", - tssState->dcId.present() ? 
tssState->dcId.get().toString().c_str() : ""); self->addActor.send( initializeStorage(self, candidateWorker, ddEnabledState, false, tssState)); // successfully started recruitment of pair, reset tss recruitment state - tssState = makeReference(); + tssState = makeReference(); tssToRecruit--; - if (tssToRecruit > 0) { - printf("%d tss pairs left to recruit\n", tssToRecruit); - } } else { - if (tssState->active) { - TEST(true); // TSS recruitment skipped potential pair because it's in a different dc - printf("Recruiting normal ss (no tss) b/c new ss is in different dc \'%s\' than tss " - "\'%s\'\n", - candidateWorker.worker.locality.dcId().present() - ? candidateWorker.worker.locality.dcId().get().toString().c_str() - : "", - tssState->dcId.present() ? tssState->dcId.get().toString().c_str() : ""); - } else { - printf("recruiting normal ss (no tss)\n"); - } + TEST(tssState->active); // TSS recruitment skipped potential pair because it's in a + // different dc/datahall self->addActor.send(initializeStorage( - self, candidateWorker, ddEnabledState, false, makeReference())); + self, candidateWorker, ddEnabledState, false, makeReference())); } } } when(wait(db->onChange())) { // SOMEDAY: only if clusterInterface or tss changes? fCandidateWorker = Future(); - // TODO REMOVE print int newTssToRecruit = self->configuration.desiredTSSCount - db->get().client.tssMapping.size(); + if (newTssToRecruit != tssToRecruit) { TraceEvent("TSS_RecruitUpdated", self->distributorId).detail("Count", newTssToRecruit); tssToRecruit = newTssToRecruit; } - // TODO HANDLE HERE if count is more than desired tss? 
- - printf("DD updated tssToRecruit=%d (%d - %d)\n", - tssToRecruit, - self->configuration.desiredTSSCount, - db->get().client.tssMapping.size()); - - if (self->isTssRecruiting && (tssToRecruit == 0 || self->zeroHealthyTeams->get())) { - TEST(tssToRecruit == 0); // tss recruitment cancelled due to too many TSS + if (self->isTssRecruiting && (tssToRecruit <= 0 || self->zeroHealthyTeams->get())) { + TEST(tssToRecruit <= 0); // tss recruitment cancelled due to too many TSS TEST(self->zeroHealthyTeams->get()); // tss recruitment cancelled due zero healthy teams TraceEvent(SevWarn, "TSS_RecruitCancelled", self->distributorId) - .detail("Reason", tssToRecruit == 0 ? "ConfigChange" : "ZeroHealthyTeams"); - printf("Cancelling tss recruitment! tssToRecruit: %d, zeroHealthyTeams: %s\n", - tssToRecruit, - self->zeroHealthyTeams->get() ? "T" : "F"); + .detail("Reason", tssToRecruit <= 0 ? "ConfigChange" : "ZeroHealthyTeams"); tssState->cancel(); - tssState = makeReference(); + tssState = makeReference(); self->isTssRecruiting = false; + } else if (!self->isTssRecruiting && + (tssToRecruit < 0 || + (self->zeroHealthyTeams->get() && db->get().client.tssMapping.size() > 0))) { + if (!sleepingAfterKillTss) { + checkKillTss = self->initialFailureReactionDelay; + } } } when(wait(self->zeroHealthyTeams->onChange())) { - // TODO refactor? if (self->isTssRecruiting && self->zeroHealthyTeams->get()) { TEST(self->zeroHealthyTeams->get()); // tss recruitment cancelled due zero healthy teams 2 TraceEvent(SevWarn, "TSS_RecruitCancelled", self->distributorId) .detail("Reason", "ZeroHealthyTeams"); - printf("Cancelling tss recruitment!! tssToRecruit: %d, zeroHealthyTeams: %s\n", - tssToRecruit, - self->zeroHealthyTeams->get() ? 
"T" : "F"); tssState->cancel(); - tssState = makeReference(); + tssState = makeReference(); self->isTssRecruiting = false; + } else if (!self->isTssRecruiting && self->zeroHealthyTeams->get() && + db->get().client.tssMapping.size() > 0) { + if (!sleepingAfterKillTss) { + checkKillTss = self->initialFailureReactionDelay; + } + } + } + when(wait(checkKillTss)) { + int tssToKill = std::min((int)db->get().client.tssMapping.size(), + std::max(-tssToRecruit, self->zeroHealthyTeams->get() ? 1 : 0)); + if (tssToKill > 0) { + for (int i = 0; i < tssToKill; i++) { + StorageServerInterface tssi = db->get().client.tssMapping[i].second; + + if (self->shouldHandleServer(tssi) && self->server_and_tss_info.count(tssi.id())) { + TraceEvent(SevWarn, "TSS_DDKill", self->distributorId) + .detail("TSSID", tssi.id()) + .detail("Reason", + self->zeroHealthyTeams->get() ? "ZeroHealthyTeams" : "ConfigChange"); + + Promise killPromise = self->server_and_tss_info[tssi.id()]->killTss; + if (killPromise.canBeSet()) { + killPromise.send(Void()); + } + } + } + // If we're killing a TSS because of zero healthy teams, wait a bit to give the replacing SS a + // change to join teams and stuff before killing another TSS + sleepingAfterKillTss = true; + checkKillTss = delay(SERVER_KNOBS->TSS_DD_KILL_INTERVAL); + } else { + sleepingAfterKillTss = false; + checkKillTss = Never(); } } when(wait(self->restartRecruiting.onTrigger())) {} @@ -5652,8 +5605,6 @@ ACTOR Future dataDistribution(Reference self, wait(waitForAll(actors)); return Void(); } catch (Error& e) { - // TODO REMOVE - printf("DD got error! %d\n", e.code()); trackerCancelled = true; state Error err = e; TraceEvent("DataDistributorDestroyTeamCollections").error(e); @@ -6321,6 +6272,4 @@ TEST_CASE("/DataDistribution/AddTeamsBestOf/NotEnoughServers") { ASSERT(result == 8); return Void(); -} - -// TODO add unit test for TSS recruitment? 
+} \ No newline at end of file diff --git a/fdbserver/DataDistributionTracker.actor.cpp b/fdbserver/DataDistributionTracker.actor.cpp index 51501c9b62..94f38622f0 100644 --- a/fdbserver/DataDistributionTracker.actor.cpp +++ b/fdbserver/DataDistributionTracker.actor.cpp @@ -497,22 +497,14 @@ ACTOR Future shardSplitter(DataDistributionTracker* self, .detail("MaxBytes", shardBounds.max.bytes) .detail("MetricsBytes", metrics.bytes) .detail("Bandwidth", - bandwidthStatus == BandwidthStatusHigh - ? "High" - : bandwidthStatus == BandwidthStatusNormal ? "Normal" : "Low") + bandwidthStatus == BandwidthStatusHigh ? "High" + : bandwidthStatus == BandwidthStatusNormal ? "Normal" + : "Low") .detail("BytesPerKSec", metrics.bytesPerKSecond) .detail("NumShards", numShards); } if (numShards > 1) { - // TODO REMOVE - printf("Splitting [%s - %s) into %d shards:\n", - splitKeys[0].toString().c_str(), - splitKeys[numShards].toString().c_str(), - numShards); - for (int i = 0; i < numShards; i++) { - printf(" [%s - %s)\n", splitKeys[i].toString().c_str(), splitKeys[i + 1].toString().c_str()); - } int skipRange = deterministicRandom()->randomInt(0, numShards); // The queue can't deal with RelocateShard requests which split an existing shard into three pieces, so // we have to send the unskipped ranges in this order (nibbling in from the edges of the old range) diff --git a/fdbserver/Knobs.cpp b/fdbserver/Knobs.cpp index 8e507f1727..61c3c62f82 100644 --- a/fdbserver/Knobs.cpp +++ b/fdbserver/Knobs.cpp @@ -217,7 +217,9 @@ void ServerKnobs::initialize(bool randomize, ClientKnobs* clientKnobs, bool isSi init( SERVER_LIST_DELAY, 1.0 ); init( RECRUITMENT_IDLE_DELAY, 1.0 ); init( STORAGE_RECRUITMENT_DELAY, 10.0 ); - init( TSS_RECRUITMENT_TIMEOUT, 3*STORAGE_RECRUITMENT_DELAY ); if (randomize && BUGGIFY ) TSS_RECRUITMENT_TIMEOUT = 1.0; //Super low timeout should cause tss recruitments to fail + init( TSS_HACK_IDENTITY_MAPPING, false ); // THIS SHOULD NEVER BE SET IN PROD. 
Only for performance testing + init( TSS_RECRUITMENT_TIMEOUT, 3*STORAGE_RECRUITMENT_DELAY ); if (randomize && BUGGIFY ) TSS_RECRUITMENT_TIMEOUT = 1.0; // Super low timeout should cause tss recruitments to fail + init( TSS_DD_KILL_INTERVAL, 60.0 ); if (randomize && BUGGIFY ) TSS_DD_KILL_INTERVAL = 1.0; // May kill all TSS quickly init( DATA_DISTRIBUTION_LOGGING_INTERVAL, 5.0 ); init( DD_ENABLED_CHECK_DELAY, 1.0 ); init( DD_STALL_CHECK_DELAY, 0.4 ); //Must be larger than 2*MAX_BUGGIFIED_DELAY diff --git a/fdbserver/Knobs.h b/fdbserver/Knobs.h index 9a4cc4a047..6b47e6ef30 100644 --- a/fdbserver/Knobs.h +++ b/fdbserver/Knobs.h @@ -167,7 +167,9 @@ public: double SERVER_LIST_DELAY; double RECRUITMENT_IDLE_DELAY; double STORAGE_RECRUITMENT_DELAY; + bool TSS_HACK_IDENTITY_MAPPING; double TSS_RECRUITMENT_TIMEOUT; + double TSS_DD_KILL_INTERVAL; double DATA_DISTRIBUTION_LOGGING_INTERVAL; double DD_ENABLED_CHECK_DELAY; double DD_STALL_CHECK_DELAY; diff --git a/fdbserver/MoveKeys.actor.cpp b/fdbserver/MoveKeys.actor.cpp index 927a7af00b..afd12a81c1 100644 --- a/fdbserver/MoveKeys.actor.cpp +++ b/fdbserver/MoveKeys.actor.cpp @@ -101,7 +101,6 @@ ACTOR static Future checkMoveKeysLock(Transaction* tr, bool isWrite = true) { if (!ddEnabledState->isDDEnabled()) { TraceEvent(SevDebug, "DDDisabledByInMemoryCheck"); - printf("MK: DD disabled\n"); throw movekeys_conflict(); } Optional readVal = wait(tr->get(moveKeysLockOwnerKey)); @@ -113,7 +112,6 @@ ACTOR static Future checkMoveKeysLock(Transaction* tr, UID lastWrite = readVal.present() ? 
BinaryReader::fromStringRef(readVal.get(), Unversioned()) : UID(); if (lastWrite != lock.prevWrite) { TEST(true); // checkMoveKeysLock: Conflict with previous owner - printf("MK: conflict with previous owner\n"); throw movekeys_conflict(); } @@ -147,7 +145,6 @@ ACTOR static Future checkMoveKeysLock(Transaction* tr, return Void(); } else { TEST(true); // checkMoveKeysLock: Conflict with new owner - printf("MK: conflict %s with new owner %s\n", currentOwner.toString().c_str(), lock.myOwner.toString().c_str()); throw movekeys_conflict(); } } @@ -330,12 +327,6 @@ ACTOR static Future startMoveKeys(Database occ, state Future warningLogger = logWarningAfter("StartMoveKeysTooLong", 600, servers); // state TraceInterval waitInterval(""); - // TODO REMOVE - printf("starting move keys for [%s, %s): to %s\n", - keys.begin.toString().c_str(), - keys.end.toString().c_str(), - servers[0].toString().c_str()); - wait(startMoveKeysLock->take(TaskPriority::DataDistributionLaunch)); state FlowLock::Releaser releaser(*startMoveKeysLock); @@ -395,10 +386,10 @@ ACTOR static Future startMoveKeys(Database occ, state KeyRange currentKeys = KeyRangeRef(begin, keys.end); state RangeResult old = wait(krmGetRanges(tr, - keyServersPrefix, - currentKeys, - SERVER_KNOBS->MOVE_KEYS_KRM_LIMIT, - SERVER_KNOBS->MOVE_KEYS_KRM_LIMIT_BYTES)); + keyServersPrefix, + currentKeys, + SERVER_KNOBS->MOVE_KEYS_KRM_LIMIT, + SERVER_KNOBS->MOVE_KEYS_KRM_LIMIT_BYTES)); // Determine the last processed key (which will be the beginning for the next iteration) state Key endKey = old.end()[-1].key; @@ -531,37 +522,15 @@ ACTOR Future waitForShardReady(StorageServerInterface server, KeyRange keys, Version minVersion, GetShardStateRequest::waitMode mode) { - // TODO REMOVE - printf("waiting for shard [%s, %s) in state %d from %sss %s @ %lld\n", - keys.begin.toString().c_str(), - keys.end.toString().c_str(), - mode, - server.isTss ? 
"t" : "", - server.id().toString().c_str(), - minVersion); loop { try { GetShardStateReply rep = wait(server.getShardState.getReply(GetShardStateRequest(keys, mode), TaskPriority::MoveKeys)); if (rep.first >= minVersion) { - // TODO REMOVE - printf("shard [%s, %s) is in state %d from %sss %s @ %lld >= %lld\n", - keys.begin.toString().c_str(), - keys.end.toString().c_str(), - mode, - server.isTss ? "t" : "", - server.id().toString().c_str(), - rep.first, - minVersion); return Void(); } wait(delayJittered(SERVER_KNOBS->SHARD_READY_DELAY, TaskPriority::MoveKeys)); } catch (Error& e) { - printf("Waiting for shard from %sss %s getValue=%s got error! %d\n", - server.isTss ? "t" : "", - server.id().toString().c_str(), - server.getValue.getEndpoint().token.toString().c_str(), - e.code()); if (e.code() != error_code_timed_out) { if (e.code() != error_code_broken_promise) throw e; @@ -656,18 +625,16 @@ ACTOR static Future finishMoveKeys(Database occ, state Key endKey; state int retries = 0; state FlowLock::Releaser releaser; - state int waitForTSSCounter = - 2; // try waiting for tss for a 2 loops, give up if they're stuck to not affect the rest of the cluster // for killing tss if any get stuck during movekeys state KeyBackedMap tssMapDB = KeyBackedMap(tssMappingKeys.begin); state std::vector tssToKill; - state std::set tssToIgnore; + state std::unordered_set tssToIgnore; + // try waiting for tss for a 2 loops, give up if they're stuck to not affect the rest of the cluster + state int waitForTSSCounter = 2; ASSERT(!destinationTeam.empty()); - printf("finishing move keys for [%s, %s)\n", keys.begin.toString().c_str(), keys.end.toString().c_str()); - try { TraceEvent(SevDebug, interval.begin(), relocationIntervalId) .detail("KeyBegin", keys.begin) @@ -680,21 +647,17 @@ ACTOR static Future finishMoveKeys(Database occ, state Transaction tr(occ); - // TODO re-comment and change back - printf("finishMoveKeys( '%s'-'%s' )\n", begin.toString().c_str(), keys.end.toString().c_str()); 
+ // printf("finishMoveKeys( '%s'-'%s' )\n", begin.toString().c_str(), keys.end.toString().c_str()); loop { try { if (tssToKill.size()) { - // TODO could move this to helper method? - // TODO add trace event TEST(true); // killing TSS because they were unavailable for movekeys - printf("KILLING %d TSS BECAUSE THEY TIMED OUT IN MOVEKEYS\n", tssToKill.size()); - // kill tss BEFORE committing main txn so that client requests don't make it to the tss when it + // Kill tss BEFORE committing main txn so that client requests don't make it to the tss when it // has a different shard set than its pair use a different RYW transaction since i'm too lazy - // (and don't want to add bugs) by changing whole method to RYW. also using a different + // (and don't want to add bugs) by changing whole method to RYW. Also, using a different // transaction makes it commit earlier which we may need to guarantee causality of tss getting - // removed before client sends a request to this key range on the new ss + // removed before client sends a request to this key range on the new SS state Reference tssTr = makeReference(occ); loop { @@ -703,25 +666,22 @@ ACTOR static Future finishMoveKeys(Database occ, tssTr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); for (auto& tss : tssToKill) { // DO NOT remove server list key - that'll break a bunch of stuff. DD will - // eventually call removeStorageServer tssTr->clear(serverListKeyFor(tss.id())); + // eventually call removeStorageServer + tssTr->clear(serverTagKeyFor(tss.id())); - // tssTr->clear(serverTagHistoryRangeFor(tss.id())); - tssMapDB.erase(tssTr, tss.tssPairID); + tssMapDB.erase(tssTr, tss.tssPairID.get()); } tssTr->set(tssMappingChangeKey, deterministicRandom()->randomUniqueID().toString()); wait(tssTr->commit()); for (auto& tss : tssToKill) { - // TODO ADD trace event (sev30?) 
- printf("Successfully removed TSS %s in finishMoveKeys\n", - tss.id().toString().c_str()); + TraceEvent(SevWarnAlways, "TSS_KillMoveKeys").detail("TSSID", tss.id().toString()); tssToIgnore.insert(tss.id()); } tssToKill.clear(); break; } catch (Error& e) { - printf("MoveKeys TSS Removal Transaction got error %d\n", e.code()); wait(tssTr->onError(e)); } } @@ -950,9 +910,6 @@ ACTOR static Future finishMoveKeys(Database occ, for (auto& f : tssReady) { if (!f.isReady() || f.isError()) { anyTssNotDone = true; - printf("MK: [%s - %s) waiting on tss!\n", - begin.toString().c_str(), - keys.end.toString().c_str()); waitForTSSCounter--; break; } @@ -961,9 +918,6 @@ ACTOR static Future finishMoveKeys(Database occ, if (anyTssNotDone && waitForTSSCounter == 0) { for (int i = 0; i < tssReady.size(); i++) { if (!tssReady[i].isReady() || tssReady[i].isError()) { - // TODO trace event!! - printf("TSS NOT DONE %s with move keys, killing!!\n", - tssReadyInterfs[i].id().toString().c_str()); tssToKill.push_back(tssReadyInterfs[i]); } } @@ -981,22 +935,21 @@ ACTOR static Future finishMoveKeys(Database occ, for (int s = 0; s < tssReady.size(); s++) tssCount += tssReady[s].isReady() && !tssReady[s].isError(); - // TODO re-comment - if (tssReady.size()) { - printf(" fMK: [%s - %s) moved data to %d/%d servers and %d/%d tss\n", - begin.toString().c_str(), - keys.end.toString().c_str(), - count, - serverReady.size(), - tssCount, - tssReady.size()); + /*if (tssReady.size()) { + printf(" fMK: [%s - %s) moved data to %d/%d servers and %d/%d tss\n", + begin.toString().c_str(), + keys.end.toString().c_str(), + count, + serverReady.size(), + tssCount, + tssReady.size()); } else { - printf(" fMK: [%s - %s) moved data to %d/%d servers\n", - begin.toString().c_str(), - keys.end.toString().c_str(), - count, - serverReady.size()); - } + printf(" fMK: [%s - %s) moved data to %d/%d servers\n", + begin.toString().c_str(), + keys.end.toString().c_str(), + count, + serverReady.size()); + }*/ 
TraceEvent(SevDebug, waitInterval.end(), relocationIntervalId).detail("ReadyServers", count); if (count == dest.size()) { @@ -1026,7 +979,6 @@ ACTOR static Future finishMoveKeys(Database occ, } tr.reset(); } catch (Error& error) { - printf(" fMK: error %d\n", error.code()); if (error.code() == error_code_actor_cancelled) throw; state Error err = error; @@ -1059,13 +1011,11 @@ ACTOR Future> addStorageServer(Database cx, StorageServe state KeyBackedMap tssMapDB = KeyBackedMap(tssMappingKeys.begin); state int maxSkipTags = 1; - printf("%sSS %s adding itself\n", server.isTss ? "T" : "", server.id().toString().c_str()); loop { try { - // TODO should also set priority system immediate? also why is this needed? tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); - // TODO don't fetch tag localities, all tags, and history tags if tss. Just fetch pair's tag + // FIXME: don't fetch tag localities, all tags, and history tags if tss. Just fetch pair's tag state Future fTagLocalities = tr->getRange(tagLocalityListKeys, CLIENT_KNOBS->TOO_MANY); state Future> fv = tr->get(serverListKeyFor(server.id())); @@ -1108,9 +1058,6 @@ ACTOR Future> addStorageServer(Database cx, StorageServe if (fExclProc.get().present() || fExclIP.get().present() || fFailProc.get().present() || fFailIP.get().present() || fExclProc2.get().present() || fExclIP2.get().present() || fFailProc2.get().present() || fFailIP2.get().present()) { - printf("%sSS %s failing to recruit because of exclusion\n", - server.isTss ? 
"T" : "", - server.id().toString().c_str()); throw recruitment_failed(); } @@ -1118,11 +1065,11 @@ ACTOR Future> addStorageServer(Database cx, StorageServe ASSERT(false); state Tag tag; - if (server.isTss) { + if (server.isTss()) { bool foundTag = false; for (auto& it : fTags.get()) { UID key = decodeServerTagKey(it.key); - if (key == server.tssPairID) { + if (key == server.tssPairID.get()) { tag = decodeServerTagValue(it.value); foundTag = true; break; @@ -1131,19 +1078,13 @@ ACTOR Future> addStorageServer(Database cx, StorageServe if (!foundTag) { throw recruitment_failed(); } - // ASSERT(foundTag); // TSS's pair was removed before TSS could register. Should never happen, since the - // SS shouldn't be tracked by DD until this completes. - printf("TSS %s found tag %s for pair %s\n", - server.id().toString().c_str(), - tag.toString().c_str(), - server.tssPairID.toString().c_str()); - tssMapDB.set(tr, server.tssPairID, server.id()); + + tssMapDB.set(tr, server.tssPairID.get(), server.id()); tr->set(tssMappingChangeKey, deterministicRandom()->randomUniqueID().toString()); } else { int8_t maxTagLocality = 0; state int8_t locality = -1; - // TODO i think tss can ignore this part? 
for (auto& kv : fTagLocalities.get()) { int8_t loc = decodeTagLocalityListValue(kv.value); if (decodeTagLocalityListKey(kv.key) == server.locality.dcId()) { @@ -1197,20 +1138,19 @@ ACTOR Future> addStorageServer(Database cx, StorageServe KeyRange conflictRange = singleKeyRange(serverTagConflictKeyFor(tag)); tr->addReadConflictRange(conflictRange); tr->addWriteConflictRange(conflictRange); + + if (SERVER_KNOBS->TSS_HACK_IDENTITY_MAPPING) { + // THIS SHOULD NEVER BE ENABLED IN ANY NON-TESTING ENVIRONMENT + TraceEvent(SevError, "TSSIdentityMappingEnabled"); + tssMapDB.set(tr, server.id(), server.id()); + tr->set(tssMappingChangeKey, deterministicRandom()->randomUniqueID().toString()); + } } tr->set(serverListKeyFor(server.id()), serverListValue(server)); wait(tr->commit()); - printf("%sSS %s successfully added itself @ %lld\n", - server.isTss ? "T" : "", - server.id().toString().c_str(), - tr->getCommittedVersion()); return std::make_pair(tr->getCommittedVersion(), tag); } catch (Error& e) { - printf("%sSS %s got error adding itself: %d!!\n", - server.isTss ? 
"T" : "", - server.id().toString().c_str(), - e.code()); if (e.code() == error_code_commit_unknown_result) throw recruitment_failed(); // There is a remote possibility that we successfully added ourselves and // then someone removed us, so we have to fail @@ -1252,8 +1192,6 @@ ACTOR Future removeStorageServer(Database cx, state bool retry = false; state int noCanRemoveCount = 0; - printf("Removing storage server %s\n", serverID.toString().c_str()); - loop { try { tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); @@ -1323,18 +1261,19 @@ ACTOR Future removeStorageServer(Database cx, } tr->clear(serverListKeyFor(serverID)); - tr->clear(serverTagKeyFor(serverID)); // the tss uses this to communicate shutdown but it never has a + tr->clear(serverTagKeyFor(serverID)); // A tss uses this to communicate shutdown but it never has a // server tag key set in the first place tr->clear(serverTagHistoryRangeFor(serverID)); - // TODO a small optimization would be to only erase and trigger tss mapping if this is a tss or an ss - // with a tss pair, instead of always - if (tssPairID.present()) { - tssMapDB.erase(tr, tssPairID.get()); - } else { + if (SERVER_KNOBS->TSS_HACK_IDENTITY_MAPPING) { + // THIS SHOULD NEVER BE ENABLED IN ANY NON-TESTING ENVIRONMENT + TraceEvent(SevError, "TSSIdentityMappingEnabled"); tssMapDB.erase(tr, serverID); + tr->set(tssMappingChangeKey, deterministicRandom()->randomUniqueID().toString()); + } else if (tssPairID.present()) { + tssMapDB.erase(tr, tssPairID.get()); + tr->set(tssMappingChangeKey, deterministicRandom()->randomUniqueID().toString()); } - tr->set(tssMappingChangeKey, deterministicRandom()->randomUniqueID().toString()); retry = true; wait(tr->commit()); @@ -1354,7 +1293,6 @@ ACTOR Future removeKeysFromFailedServer(Database cx, MoveKeysLock lock, const DDEnabledState* ddEnabledState) { state Key begin = allKeys.begin; - printf("Removing keys from failed server %s\n", serverID.toString().c_str()); // Multi-transactional removal 
in case of large number of shards, concern in violating 5s transaction limit while (begin < allKeys.end) { state Transaction tr(cx); @@ -1456,8 +1394,6 @@ ACTOR Future moveKeys(Database cx, if (!dataMovementComplete.isSet()) dataMovementComplete.send(Void()); - printf("move keys done for [%s, %s)\n", keys.begin.toString().c_str(), keys.end.toString().c_str()); - return Void(); } @@ -1486,6 +1422,15 @@ void seedShardServers(Arena& arena, CommitTransactionRef& tr, vectorTSS_HACK_IDENTITY_MAPPING) { + // THIS SHOULD NEVER BE ENABLED IN ANY NON-TESTING ENVIRONMENT + TraceEvent(SevError, "TSSIdentityMappingEnabled"); + // hack key-backed map here since we can't really change CommitTransactionRef to a RYW transaction + Key uidRef = Codec::pack(s.id()).pack(); + tr.set(arena, uidRef.withPrefix(tssMappingKeys.begin), uidRef); + // tssMapDB.set(tr, server.id(), server.id()); + tr.set(arena, tssMappingChangeKey, deterministicRandom()->randomUniqueID().toString()); + } } std::vector serverTags; diff --git a/fdbserver/MutationTracking.cpp b/fdbserver/MutationTracking.cpp index b0e7215fb8..16a17a0f10 100644 --- a/fdbserver/MutationTracking.cpp +++ b/fdbserver/MutationTracking.cpp @@ -30,9 +30,6 @@ // Track up to 2 keys in simulation via enabling MUTATION_TRACKING_ENABLED and setting the keys here. 
StringRef debugKey = LiteralStringRef(""); StringRef debugKey2 = LiteralStringRef("\xff\xff\xff\xff"); -// StringRef debugKey = LiteralStringRef("\x00\x00\x02\xff\x00\x00\x04\xc1\x00\x00\x00\x01\x00\x00\x00\x02"); // missing -// from ss StringRef debugKey2 = LiteralStringRef("\x00\x00\x02\xff\x00\x00\x01\x89\x00\x00\x00\x04\x00\x00\x00\x02"); -// // missing from tss TraceEvent debugMutationEnabled(const char* context, Version version, MutationRef const& mutation) { if ((mutation.type == mutation.ClearRange || mutation.type == mutation.DebugKeyRange) && diff --git a/fdbserver/QuietDatabase.actor.cpp b/fdbserver/QuietDatabase.actor.cpp index 40f731aed6..223393bcdd 100644 --- a/fdbserver/QuietDatabase.actor.cpp +++ b/fdbserver/QuietDatabase.actor.cpp @@ -294,11 +294,6 @@ ACTOR Future getMaxStorageServerQueueSize(Database cx, Reference servers = wait(serversFuture); state std::vector workers = wait(workersFuture); - /*printf("Found %d storage servers:\n", servers.size()); - for (auto& it : servers) { - printf(" %s\n", it.id().toString().c_str()); - }*/ - std::map workersMap; for (auto worker : workers) { workersMap[worker.interf.address()] = worker.interf; @@ -328,7 +323,6 @@ ACTOR Future getMaxStorageServerQueueSize(Database cx, Reference trackEachStorageServer( when(state std::pair> change = waitNext(serverChanges)) { wait(delay(0)); // prevent storageServerTracker from getting cancelled while on the call stack if (change.second.present()) { - if (!change.second.get().isTss) { // TODO is this all we need to do to get ratekeeper to ignore tss? 
+ if (!change.second.get().isTss()) { auto& a = actors[change.first]; a = Future(); a = splitError(trackStorageServerQueueInfo(self, change.second.get()), err); diff --git a/fdbserver/SimulatedCluster.actor.cpp b/fdbserver/SimulatedCluster.actor.cpp index 24d7dfb01d..70fed849d9 100644 --- a/fdbserver/SimulatedCluster.actor.cpp +++ b/fdbserver/SimulatedCluster.actor.cpp @@ -170,6 +170,9 @@ class TestConfig { if (attrib == "maxTLogVersion") { sscanf(value.c_str(), "%d", &maxTLogVersion); } + if (attrib == "restartInfoLocation") { + isFirstTestInRestart = true; + } } ifs.close(); @@ -183,6 +186,7 @@ public: bool configureLocked = false; bool startIncompatibleProcess = false; int logAntiQuorum = -1; + bool firstTestInRestart = false; // Storage Engine Types: Verify match with SimulationConfig::generateNormalConfig // 0 = "ssd" // 1 = "memory" @@ -235,6 +239,8 @@ public: for (const auto& [key, value] : conf) { if (key == "ClientInfoLogging") { setNetworkOption(FDBNetworkOptions::DISABLE_CLIENT_STATISTICS_LOGGING); + } else if (key == "restartInfoLocation") { + isFirstTestInRestart = true; } else { builder.set(key, value); } @@ -1165,13 +1171,9 @@ void SimulationConfig::generateNormalConfig(const TestConfig& testConfig) { } int tssCount = 0; - // if (!testConfig.simpleConfig && deterministicRandom()->random01() < 0.25) { - if (true) { - // if (false) { - // tss + if (!testconfig.simpleConfig && deterministicRandom()->random01() < 0.25) { // 1 or 2 tss tssCount = deterministicRandom()->randomInt(1, 3); - printf("Initial tss count to %d\n", tssCount); } // if (deterministicRandom()->random01() < 0.5) { @@ -1510,7 +1512,6 @@ void SimulationConfig::generateNormalConfig(const TestConfig& testConfig) { // reduce tss to half of extra non-seed servers that can be recruited in usable regions. 
tssCount = std::max(0, std::min(tssCount, (db.usableRegions * (machine_count / datacenters) - replication_type) / 2)); - printf("Adjusted tss count to %d\n", tssCount); if (tssCount > 0) { std::string confStr = format("tss_count:=%d tss_storage_engine:=%d", tssCount, db.storageServerStoreType); @@ -1519,13 +1520,13 @@ void SimulationConfig::generateNormalConfig(const TestConfig& testConfig) { if (tssRandom > 0.5) { // normal tss mode g_simulator.tssMode = ISimulator::TSSMode::EnabledNormal; - printf("normal tss mode\n"); - } else if (tssRandom < 0.25) { + } else if (tssRandom < 0.25 && !testConfig.isFirstTestInRestart) { + // fault injection - don't enable in first test in restart because second test won't know it intentionally + // lost data + g_simulator.tssMode = ISimulator::TSSMode::EnabledDropMutations; + } else { // delay injection g_simulator.tssMode = ISimulator::TSSMode::EnabledAddDelay; - } else { - // fault injection - g_simulator.tssMode = ISimulator::TSSMode::EnabledDropMutations; } printf("enabling tss for simulation in mode %d: %s\n", g_simulator.tssMode, confStr.c_str()); } diff --git a/fdbserver/Status.actor.cpp b/fdbserver/Status.actor.cpp index 723c6c6111..5305abacbc 100644 --- a/fdbserver/Status.actor.cpp +++ b/fdbserver/Status.actor.cpp @@ -630,7 +630,7 @@ struct RolesInfo { TraceEventFields const& commitLatencyBands = metrics.at("CommitLatencyBands"); if (commitLatencyBands.size()) { obj["commit_latency_bands"] = addLatencyBandInfo(commitLatencyBands); - } + } TraceEventFields const& commitBatchingWindowSize = metrics.at("CommitBatchingWindowSize"); if (commitBatchingWindowSize.size()) { @@ -1869,10 +1869,10 @@ ACTOR static Future>> getTLogsAndMetri ACTOR static Future>> getCommitProxiesAndMetrics( Reference> db, std::unordered_map address_workers) { - vector> results = - wait(getServerMetrics(db->get().client.commitProxies, - address_workers, - std::vector{ "CommitLatencyMetrics", "CommitLatencyBands", "CommitBatchingWindowSize"})); + vector> 
results = wait(getServerMetrics( + db->get().client.commitProxies, + address_workers, + std::vector{ "CommitLatencyMetrics", "CommitLatencyBands", "CommitBatchingWindowSize" })); return results; } @@ -3007,7 +3007,7 @@ ACTOR Future clusterGetStatus( int activeTSSCount = 0; for (auto& it : storageServers) { - if (it.first.isTss) { + if (it.first.isTss()) { activeTSSCount++; } } diff --git a/fdbserver/TLogServer.actor.cpp b/fdbserver/TLogServer.actor.cpp index f884a2e310..4ea9e83bee 100644 --- a/fdbserver/TLogServer.actor.cpp +++ b/fdbserver/TLogServer.actor.cpp @@ -1671,11 +1671,6 @@ ACTOR Future tLogPeekMessages(TLogData* self, TLogPeekRequest req, Referen Version poppedVer = poppedVersion(logData, req.tag); if (poppedVer > req.begin) { - printf("tag %s - %s tried to peek popped data!!: %lld > %lld\n", - req.tag.toString().c_str(), - peekId.toString().c_str(), - poppedVer, - req.begin); TLogPeekReply rep; rep.maxKnownVersion = logData->version.get(); rep.minKnownCommittedVersion = logData->minKnownCommittedVersion; diff --git a/fdbserver/WorkerInterface.actor.h b/fdbserver/WorkerInterface.actor.h index 48a4d9ce07..689bf0a68c 100644 --- a/fdbserver/WorkerInterface.actor.h +++ b/fdbserver/WorkerInterface.actor.h @@ -614,18 +614,13 @@ struct InitializeStorageRequest { UID reqId; UID interfaceId; KeyValueStoreType storeType; - bool isTss; - UID tssPairID; - Version tssPairVersion; + Optional> + tssPairIDAndVersion; // Only set if recruiting a tss. Will be the UID and Version of its SS pair. 
ReplyPromise reply; template void serialize(Ar& ar) { - if (ar.protocolVersion().hasTSS()) { - serializer(ar, seedTag, reqId, interfaceId, storeType, reply, isTss, tssPairID, tssPairVersion); - } else { - serializer(ar, seedTag, reqId, interfaceId, storeType, reply); - } + serializer(ar, seedTag, reqId, interfaceId, storeType, reply, tssPairIDAndVersion); } }; diff --git a/fdbserver/masterserver.actor.cpp b/fdbserver/masterserver.actor.cpp index c0dee60682..97953ce1a3 100644 --- a/fdbserver/masterserver.actor.cpp +++ b/fdbserver/masterserver.actor.cpp @@ -417,14 +417,11 @@ ACTOR Future newTLogServers(Reference self, ACTOR Future newSeedServers(Reference self, RecruitFromConfigurationReply recruits, vector* servers) { - printf("Seeding initial %d storage servers\n", recruits.storageServers.size()); // This is only necessary if the database is at version 0 servers->clear(); if (self->lastEpochEnd) return Void(); - // TODO might need to make this handle TSS recruitment (or make RecruitFromConfiguration handle it?) for simulation - state int idx = 0; state std::map, Tag> dcId_tags; state int8_t nextLocality = 0; @@ -437,7 +434,6 @@ ACTOR Future newSeedServers(Reference self, ? 
dcId_tags[recruits.storageServers[idx].locality.dcId()] : Tag(nextLocality, 0); isr.storeType = self->configuration.storageServerStoreType; - isr.isTss = false; isr.reqId = deterministicRandom()->randomUniqueID(); isr.interfaceId = deterministicRandom()->randomUniqueID(); @@ -473,8 +469,6 @@ ACTOR Future newSeedServers(Reference self, .detail("TargetCount", self->configuration.storageTeamSize) .detail("Servers", describe(*servers)); - printf("Seed servers sees %d desired tss\n", self->configuration.desiredTSSCount); - return Void(); } diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp index 7fe0b1c2a3..507de28f32 100644 --- a/fdbserver/storageserver.actor.cpp +++ b/fdbserver/storageserver.actor.cpp @@ -523,10 +523,6 @@ public: TraceEvent(SevWarnAlways, "TSSInjectFaultEnabled", thisServerID) .detail("Mode", g_simulator.tssMode) .detail("At", tssFaultInjectTime.get()); - printf("ENABLING FAULT INJECTION FOR TSS %s at time %.4f in mode %d\n", - thisServerID.toString().c_str(), - tssFaultInjectTime.get(), - g_simulator.tssMode); } } @@ -1077,24 +1073,12 @@ void updateProcessStats(StorageServer* self) { ACTOR Future waitForVersionActor(StorageServer* data, Version version, SpanID spanContext) { state Span span("SS.WaitForVersion"_loc, { spanContext }); - /*if (172218491 == version) { - printf("%sSS %s starting waitForVersionActor @ %lld\n", data->tssPairID.present() ? "T" : "", - data->thisServerID.toString().c_str(), version); - }*/ choose { when(wait(data->version.whenAtLeast(version))) { // FIXME: A bunch of these can block with or without the following delay 0. // wait( delay(0) ); // don't do a whole bunch of these at once - /*if (172218491 == version) { - printf("%sSS %s waitForVersionActor @ %lld - at least version\n", data->tssPairID.present() ? 
"T" : "", - data->thisServerID.toString().c_str(), version); - }*/ if (version < data->oldestVersion.get()) throw transaction_too_old(); // just in case - /*if (172218491 == version) { - printf("%sSS %s waitForVersionActor @ %lld - not too old\n", data->tssPairID.present() ? "T" : "", - data->thisServerID.toString().c_str(), version); - }*/ return version; } when(wait(delay(SERVER_KNOBS->FUTURE_VERSION_DELAY))) { @@ -1103,39 +1087,23 @@ ACTOR Future waitForVersionActor(StorageServer* data, Version version, .detail("Version", version) .detail("MyVersion", data->version.get()) .detail("ServerID", data->thisServerID); - /*if (172218491 == version) { - printf("%sSS %s waitForVersionActor @ %lld - future version\n", data->tssPairID.present() ? "T" : "", - data->thisServerID.toString().c_str(), version); - }*/ throw future_version(); } } } Future waitForVersion(StorageServer* data, Version version, SpanID spanContext) { - /*if (172218491 == version) { - printf("%sSS %s started waitForVersion @ %lld\n", data->tssPairID.present() ? "T" : "", - data->thisServerID.toString().c_str(), version); - }*/ if (version == latestVersion) { version = std::max(Version(1), data->version.get()); } if (version < data->oldestVersion.get() || version <= 0) { - /*if (172218491 == version) { - printf("%sSS %s waitForVersion @ %lld - transaction too old\n", data->tssPairID.present() ? "T" : "", - data->thisServerID.toString().c_str(), version); - }*/ return transaction_too_old(); } else if (version <= data->version.get()) { return version; } if ((data->behind || data->versionBehind) && version > data->version.get()) { - /*if (172218491 == version) { - printf("%sSS %s waitForVersion @ %lld - process_behind\n", data->tssPairID.present() ? 
"T" : "", - data->thisServerID.toString().c_str(), version); - }*/ return process_behind(); } @@ -1169,11 +1137,6 @@ ACTOR Future getValueQ(StorageServer* data, GetValueRequest req) { Span span("SS:getValue"_loc, { req.spanContext }); span.addTag("key"_sr, req.key); - /*if ("000002ff000004c10000000100000002" == req.key.toString() && 172218491 == req.version) { - printf("%sSS %s started getValueQ for %s @ %lld\n", data->tssPairID.present() ? "T" : "", - data->thisServerID.toString().c_str(), req.key.toString().c_str(), req.version); - }*/ - try { ++data->counters.getValueQueries; ++data->counters.allQueries; @@ -1185,11 +1148,6 @@ ACTOR Future getValueQ(StorageServer* data, GetValueRequest req) { // so we need to downgrade here wait(data->getQueryDelay()); - /*if ("000002ff000004c10000000100000002" == req.key.toString() && 172218491 == req.version) { - printf("%sSS %s getValueQ for %s @ %lld - got query delay\n", data->tssPairID.present() ? "T" : "", - data->thisServerID.toString().c_str(), req.key.toString().c_str(), req.version); - }*/ - if (req.debugID.present()) g_traceBatch.addEvent("GetValueDebug", req.debugID.get().first(), @@ -1204,17 +1162,8 @@ ACTOR Future getValueQ(StorageServer* data, GetValueRequest req) { state uint64_t changeCounter = data->shardChangeCounter; - /*if ("000002ff000004c10000000100000002" == req.key.toString() && 172218491 == req.version) { - printf("%sSS %s getValueQ for %s @ %lld - waited for version\n", data->tssPairID.present() ? "T" : "", - data->thisServerID.toString().c_str(), req.key.toString().c_str(), req.version); - }*/ - if (!data->shards[req.key]->isReadable()) { //TraceEvent("WrongShardServer", data->thisServerID).detail("Key", req.key).detail("Version", version).detail("In", "getValueQ"); - /*if ("000002ff000004c10000000100000002" == req.key.toString() && 172218491 == req.version) { - printf("%sSS %s started getValueQ for %s @ %lld got wrong shard server\n", data->tssPairID.present() ? 
- "T" : "", data->thisServerID.toString().c_str(), req.key.toString().c_str(), req.version); - }*/ throw wrong_shard_server(); } @@ -1223,10 +1172,6 @@ ACTOR Future getValueQ(StorageServer* data, GetValueRequest req) { if (i && i->isValue() && i.key() == req.key) { v = (Value)i->getValue(); path = 1; - /*if ("000002ff000004c10000000100000002" == req.key.toString() && 172218491 == req.version) { - printf("%sSS %s getValueQ for %s @ %lld - got from memory\n", data->tssPairID.present() ? "T" : "", - data->thisServerID.toString().c_str(), req.key.toString().c_str(), req.version); - }*/ } else if (!i || !i->isClearTo() || i->getEndKey() <= req.key) { path = 2; Optional vv = wait(data->storage.readValue(req.key, req.debugID)); @@ -1237,10 +1182,6 @@ ACTOR Future getValueQ(StorageServer* data, GetValueRequest req) { } data->checkChangeCounter(changeCounter, req.key); v = vv; - /*if ("000002ff000004c10000000100000002" == req.key.toString() && 172218491 == req.version) { - printf("%sSS %s getValueQ for %s @ %lld - got from storage\n", data->tssPairID.present() ? "T" : "", - data->thisServerID.toString().c_str(), req.key.toString().c_str(), req.version); - }*/ } DEBUG_MUTATION("ShardGetValue", @@ -1268,12 +1209,6 @@ ACTOR Future getValueQ(StorageServer* data, GetValueRequest req) { ++data->counters.emptyQueries; } - /*if ("000002ff000004c10000000100000002" == req.key.toString() && 172218491 == req.version) { - printf("%sSS %s getValueQ for %s @ %lld = %s\n", data->tssPairID.present() ? "T" : "", - data->thisServerID.toString().c_str(), req.key.toString().c_str(), req.version, v.present() ? - v.get().toString().c_str() : ""); - }*/ - if (SERVER_KNOBS->READ_SAMPLING_ENABLED) { // If the read yields no value, randomly sample the empty read. 
int64_t bytesReadPerKSecond = @@ -1296,16 +1231,8 @@ ACTOR Future getValueQ(StorageServer* data, GetValueRequest req) { reply.penalty = data->getPenalty(); req.reply.send(reply); } catch (Error& e) { - /*if ("000002ff000004c10000000100000002" == req.key.toString() && 172218491 == req.version) { - printf("%sSS %s getValueQ for %s @ %lld = ERROR: %d\n", data->tssPairID.present() ? "T" : "", - data->thisServerID.toString().c_str(), req.key.toString().c_str(), req.version, e.code()); - }*/ if (!canReplyWith(e)) throw; - /*if ("000002ff000004c10000000100000002" == req.key.toString() && 172218491 == req.version) { - printf("%sSS %s getValueQ for %s @ %lld = replying with error: %d\n", data->tssPairID.present() ? "T" : "", - data->thisServerID.toString().c_str(), req.key.toString().c_str(), req.version, e.code()); - }*/ data->sendErrorWithPenalty(req.reply, e, data->getPenalty()); } @@ -1816,12 +1743,6 @@ ACTOR Future findKey(StorageServer* data, state int distance = forward ? sel.offset : 1 - sel.offset; state Span span("SS.findKey"_loc, { parentSpan }); - /*if (version == 166817893 && sel.offset == 80) { - printf("%sSS %s FindKey request %s:<%s:%d @ %lld: with key range [%s - %s):\n", data->isTss() ? "t" : "", - data->thisServerID.toString().c_str(), sel.getKey().printable().c_str(), sel.orEqual ? "=" : "", sel.offset, - version, range.begin.toString().c_str(), range.end.toString().c_str()); - }*/ - // Don't limit the number of bytes if this is a trivial key selector (there will be at most two items returned from // the read range in this case) state int maxBytes; @@ -1841,13 +1762,6 @@ ACTOR Future findKey(StorageServer* data, span.context)); state bool more = rep.more && rep.data.size() != distance + skipEqualKey; - /*if (version == 166817893 && sel.offset == 80) { - printf("%sSS %s FindKey request %s:<%s:%d @ %lld: readRange with limBytes=%d got %d:\n", data->isTss() ? "t" : - "", data->thisServerID.toString().c_str(), sel.getKey().printable().c_str(), sel.orEqual ? 
"=" : "", sel.offset, - version, maxBytes, rep.data.size()); for (auto& it : rep.data) { printf(" %s\n", it.key.toString().c_str()); - } - }*/ - // If we get only one result in the reverse direction as a result of the data being too large, we could get stuck in // a loop if (more && !forward && rep.data.size() == 1) { @@ -1894,19 +1808,8 @@ ACTOR Future findKey(StorageServer* data, // This is possible if key/value pairs are very large and only one result is returned on a last less than // query SOMEDAY: graceful handling of exceptionally sized values ASSERT(returnKey != sel.getKey()); - - /*if (version == 166817893 && sel.offset == 80) { - printf("%sSS %s FindKey request %s:<%s:%d @ %lld: moving same shard\n", data->isTss() ? "t" : "", - data->thisServerID.toString().c_str(), sel.getKey().printable().c_str(), sel.orEqual ? "=" : "", sel.offset, - version); - }*/ return returnKey; } else { - /*if (version == 166817893 && sel.offset == 80) { - printf("%sSS %s FindKey request %s:<%s:%d @ %lld: moving shard boundary\n", data->isTss() ? "t" : "", - data->thisServerID.toString().c_str(), sel.getKey().printable().c_str(), sel.orEqual ? "=" : "", sel.offset, - version); - }*/ return forward ? range.end : range.begin; } } @@ -1931,15 +1834,6 @@ ACTOR Future getKeyValuesQ(StorageServer* data, GetKeyValuesRequest req) state Span span("SS:getKeyValues"_loc, { req.spanContext }); state int64_t resultSize = 0; - if (req.begin.getKey().toString() == "B" && req.end.getKey().toString() == "v" && req.version == 107157353) { - printf("%sSS %s starting query [%s - %s) @ %lld\n", - data->isTss() ? 
"T" : "", - data->thisServerID.toString().c_str(), - req.begin.getKey().printable().c_str(), - req.end.getKey().printable().c_str(), - req.version); - } - ++data->counters.getRangeQueries; ++data->counters.allQueries; ++data->readQueueSizeMetric; @@ -1954,15 +1848,6 @@ ACTOR Future getKeyValuesQ(StorageServer* data, GetKeyValuesRequest req) wait(data->getQueryDelay()); } - if (req.begin.getKey().toString() == "B" && req.end.getKey().toString() == "v" && req.version == 107157353) { - printf("%sSS %s downgraded [%s - %s) @ %lld\n", - data->isTss() ? "T" : "", - data->thisServerID.toString().c_str(), - req.begin.getKey().printable().c_str(), - req.end.getKey().printable().c_str(), - req.version); - } - try { if (req.debugID.present()) g_traceBatch.addEvent("TransactionDebug", req.debugID.get().first(), "storageserver.getKeyValues.Before"); @@ -1987,15 +1872,6 @@ ACTOR Future getKeyValuesQ(StorageServer* data, GetKeyValuesRequest req) throw wrong_shard_server(); } - if (req.begin.getKey().toString() == "B" && req.end.getKey().toString() == "v" && req.version == 107157353) { - printf("%sSS %s validated shard [%s - %s) @ %lld\n", - data->isTss() ? "T" : "", - data->thisServerID.toString().c_str(), - req.begin.getKey().printable().c_str(), - req.end.getKey().printable().c_str(), - req.version); - } - state int offset1; state int offset2; state Future fBegin = req.begin.isFirstGreaterOrEqual() @@ -2026,25 +1902,6 @@ ACTOR Future getKeyValuesQ(StorageServer* data, GetKeyValuesRequest req) throw wrong_shard_server(); } - if (req.begin.getKey().toString() == "B" && req.end.getKey().toString() == "v" && req.version == 107157353) { - printf("%sSS %s resolved begin and end [%s - %s) @ %lld\n", - data->isTss() ? "T" : "", - data->thisServerID.toString().c_str(), - req.begin.getKey().printable().c_str(), - req.end.getKey().printable().c_str(), - req.version); - printf(" %s:<%s:%d @ -> %s\n", - req.begin.getKey().printable().c_str(), - req.begin.orEqual ? 
"=" : "", - req.begin.offset, - req.begin.getKey().printable().c_str()); - printf(" %s:<%s:%d @ -> %s\n", - req.end.getKey().printable().c_str(), - req.end.orEqual ? "=" : "", - req.end.offset, - req.end.getKey().printable().c_str()); - } - if (begin >= end) { if (req.debugID.present()) g_traceBatch.addEvent("TransactionDebug", req.debugID.get().first(), "storageserver.getKeyValues.Send"); @@ -2062,28 +1919,10 @@ ACTOR Future getKeyValuesQ(StorageServer* data, GetKeyValuesRequest req) } else { state int remainingLimitBytes = req.limitBytes; - /*if (req.begin.getKey().toString() == "m3fc7" && req.end.getKey().toString() == "s" && req.version == - 133421369) { printf("%sSS %s beginning readRange [%s - %s) @ %lld\n", data->isTss() ? "T" : "", - data->thisServerID.toString().c_str(), req.begin.getKey().printable().c_str(), - req.end.getKey().printable().c_str(), req.version); - }*/ - GetKeyValuesReply _r = wait(readRange(data, version, KeyRangeRef(begin, end), req.limit, &remainingLimitBytes, span.context)); GetKeyValuesReply r = _r; - if (req.begin.getKey().toString() == "B" && req.end.getKey().toString() == "v" && - req.version == 107157353) { - printf("%sSS %s completed readRange (%d)%s: \n", - data->isTss() ? "T" : "", - data->thisServerID.toString().c_str(), - r.data.size(), - r.more ? "+" : ""); - /*for (auto& it : r.data) { - printf(" %s=%s\n", it.key.printable().c_str(), it.value.printable().c_str()); - }*/ - } - if (req.debugID.present()) g_traceBatch.addEvent( "TransactionDebug", req.debugID.get().first(), "storageserver.getKeyValues.AfterReadRange"); @@ -2116,14 +1955,6 @@ ACTOR Future getKeyValuesQ(StorageServer* data, GetKeyValuesRequest req) data->metrics.notifyBytesReadPerKSecond(r.data[r.data.size() - 1].key, bytesReadPerKSecond); } - if (req.begin.getKey().toString() == "B" && req.end.getKey().toString() == "v" && - req.version == 107157353) { - printf("%sSS %s replying to %s\n", - data->isTss() ? 
"T" : "", - data->thisServerID.toString().c_str(), - req.reply.getEndpoint().token.toString().c_str()); - } - r.penalty = data->getPenalty(); req.reply.send(r); @@ -2174,33 +2005,15 @@ ACTOR Future getKeyQ(StorageServer* data, GetKeyRequest req) { // so we need to downgrade here wait(data->getQueryDelay()); - /*if (req.version == 166817893 && req.sel.offset == 80) { - printf("%sSS %s GetKey request %s:<%s:%d @ %lld\n", data->isTss() ? "t" : "", - data->thisServerID.toString().c_str(), req.sel.getKey().printable().c_str(), req.sel.orEqual ? "=" : "", - req.sel.offset, req.version); - }*/ - try { state Version version = wait(waitForVersion(data, req.version, req.spanContext)); - /*if (req.version == 166817893 && req.sel.offset == 80) { - printf("%sSS %s GetKey request %s:<%s:%d @ %lld: waited for version\n", data->isTss() ? "t" : "", - data->thisServerID.toString().c_str(), req.sel.getKey().printable().c_str(), req.sel.orEqual ? "=" : "", - req.sel.offset, req.version); - }*/ - state uint64_t changeCounter = data->shardChangeCounter; state KeyRange shard = getShardKeyRange(data, req.sel); state int offset; Key k = wait(findKey(data, req.sel, version, shard, &offset, req.spanContext)); - /*if (req.version == 166817893 && req.sel.offset == 80) { - printf("%sSS %s GetKey request %s:<%s:%d @ %lld: found key: %s\n", data->isTss() ? "t" : "", - data->thisServerID.toString().c_str(), req.sel.getKey().printable().c_str(), req.sel.orEqual ? "=" : "", - req.sel.offset, req.version, k.toString().c_str()); - }*/ - data->checkChangeCounter( changeCounter, KeyRangeRef(std::min(req.sel.getKey(), k), std::max(req.sel.getKey(), k))); @@ -2215,12 +2028,6 @@ ACTOR Future getKeyQ(StorageServer* data, GetKeyRequest req) { else updated = KeySelectorRef(k, true, 0); // found - /*if (req.version == 166817893 && req.sel.offset == 80) { - printf("%sSS %s GetKey request %s:<%s:%d @ %lld: updated: %s:<%s:%d\n", data->isTss() ? 
"t" : "", - data->thisServerID.toString().c_str(), req.sel.getKey().printable().c_str(), req.sel.orEqual ? "=" : "", - req.sel.offset, req.version, updated.getKey().printable().c_str(), updated.orEqual ? "=" : "", updated.offset); - }*/ - resultSize = k.size(); data->counters.bytesQueried += resultSize; ++data->counters.rowsQueried; @@ -2545,14 +2352,6 @@ void removeDataRange(StorageServer* ss, // disk when this latest version becomes durable mLV is also modified if necessary to ensure that split clears can // be forgotten - // TODO REMOVE print - printf("%sss %s removing data range [%s - %s) @ %lld\n", - ss->isTss() ? "t" : "", - ss->thisServerID.toString().c_str(), - range.begin.toString().c_str(), - range.end.toString().c_str(), - mLV.version); - MutationRef clearRange(MutationRef::ClearRange, range.begin, range.end); clearRange = ss->addMutationToMutationLog(mLV, clearRange); @@ -2583,13 +2382,6 @@ void removeDataRange(StorageServer* ss, } data.erase(range.begin, range.end); - - printf("%sss %s removed data range [%s - %s) @ %lld\n", - ss->isTss() ? 
"t" : "", - ss->thisServerID.toString().c_str(), - range.begin.toString().c_str(), - range.end.toString().c_str(), - mLV.version); } void setAvailableStatus(StorageServer* self, KeyRangeRef keys, bool available); @@ -3170,12 +2962,12 @@ void changeServerKeys(StorageServer* data, ChangeServerKeysContext context) { ASSERT(!keys.empty()); - TraceEvent("ChangeServerKeys", data->thisServerID) - .detail("KeyBegin", keys.begin) - .detail("KeyEnd", keys.end) - .detail("NowAssigned", nowAssigned) - .detail("Version", version) - .detail("Context", changeServerKeysContextName[(int)context]); + // TraceEvent("ChangeServerKeys", data->thisServerID) + // .detail("KeyBegin", keys.begin) + // .detail("KeyEnd", keys.end) + // .detail("NowAssigned", nowAssigned) + // .detail("Version", version) + // .detail("Context", changeServerKeysContextName[(int)context]); validate(data); // TODO(alexmiller): Figure out how to selectively enable spammy data distribution events. @@ -3193,7 +2985,7 @@ void changeServerKeys(StorageServer* data, } } if (!isDifferent) { - TraceEvent("CSKShortCircuit", data->thisServerID).detail("KeyBegin", keys.begin).detail("KeyEnd", keys.end); + // TraceEvent("CSKShortCircuit", data->thisServerID).detail("KeyBegin", keys.begin).detail("KeyEnd", keys.end); return; } @@ -3231,13 +3023,13 @@ void changeServerKeys(StorageServer* data, for (auto r = vr.begin(); r != vr.end(); ++r) { KeyRangeRef range = keys & r->range(); bool dataAvailable = r->value() == latestVersion || r->value() >= version; - TraceEvent("CSKRange", data->thisServerID) - .detail("KeyBegin", range.begin) - .detail("KeyEnd", range.end) - .detail("Available", dataAvailable) - .detail("NowAssigned", nowAssigned) - .detail("NewestAvailable", r->value()) - .detail("ShardState0", data->shards[range.begin]->debugDescribeState()); + // TraceEvent("CSKRange", data->thisServerID) + // .detail("KeyBegin", range.begin) + // .detail("KeyEnd", range.end) + // .detail("Available", dataAvailable) + // 
.detail("NowAssigned", nowAssigned) + // .detail("NewestAvailable", r->value()) + // .detail("ShardState0", data->shards[range.begin]->debugDescribeState()); if (!nowAssigned) { if (dataAvailable) { ASSERT(r->value() == @@ -3279,14 +3071,8 @@ void changeServerKeys(StorageServer* data, oldShards.clear(); ranges.clear(); for (auto r = removeRanges.begin(); r != removeRanges.end(); ++r) { - // TODO should we do this at the passed in version? (or the passed in version + 1?) removeDataRange(data, data->addVersionToMutationLog(data->data().getLatestVersion()), data->shards, *r); setAvailableStatus(data, *r, false); - printf("%sss %s set data range unavailable [%s - %s)\n", - data->isTss() ? "t" : "", - data->thisServerID.toString().c_str(), - keys.begin.toString().c_str(), - keys.end.toString().c_str()); } validate(data); } @@ -3458,26 +3244,18 @@ private: data->recoveryVersionSkips.emplace_back(rollbackVersion, currentVersion - rollbackVersion); } else if (m.type == MutationRef::SetValue && m.param1 == killStoragePrivateKey) { - printf("worked removed kill storage: %s\n", data->thisServerID.toString().c_str()); throw worker_removed(); } else if ((m.type == MutationRef::SetValue || m.type == MutationRef::ClearRange) && m.param1.substr(1).startsWith(serverTagPrefix)) { UID serverTagKey = decodeServerTagKey(m.param1.substr(1)); - // bool matchesThisServer = (!data->isTss() && serverTagKey == data->thisServerID) || (data->isTss() && - // serverTagKey == data->tssPairID.get()); bool matchesThisServer = serverTagKey == data->thisServerID; bool matchesTssPair = data->isTss() ? serverTagKey == data->tssPairID.get() : false; if ((m.type == MutationRef::SetValue && !data->isTss() && !matchesThisServer) || (m.type == MutationRef::ClearRange && (matchesThisServer || (data->isTss() && matchesTssPair)))) { - printf("%sSS %s removed b/c tag mutation: %s\n", - data->isTss() ? 
"T" : "", - data->thisServerID.toString().c_str(), - m.toString().c_str()); throw worker_removed(); } } else if (m.type == MutationRef::SetValue && m.param1 == rebootWhenDurablePrivateKey) { data->rebootAfterDurableVersion = currentVersion; - printf("%s got reboot after durable @ %lld\n", data->thisServerID.toString().c_str(), currentVersion); TraceEvent("RebootWhenDurableSet", data->thisServerID) .detail("DurableVersion", data->durableVersion.get()) .detail("RebootAfterDurableVersion", data->rebootAfterDurableVersion); @@ -3542,12 +3320,10 @@ ACTOR Future update(StorageServer* data, bool* pReceivedUpdate) { wait(delayJittered(.005, TaskPriority::TLogPeekReply)); } - // TODO REMOVE!! just for testing what happens when TSS gets behind if (g_network->isSimulated() && data->isTss() && g_simulator.tssMode == ISimulator::TSSMode::EnabledAddDelay && data->tssFaultInjectTime.present() && data->tssFaultInjectTime.get() < now()) { if (deterministicRandom()->random01() < 0.01) { TraceEvent(SevWarnAlways, "TSSInjectDelayForever", data->thisServerID); - printf("TSS %s INJECTING DELAY FOREVER!!\n", data->thisServerID.toString().c_str()); // small random chance to just completely get stuck here, each tss should eventually hit this in this // mode wait(Never()); @@ -3555,7 +3331,6 @@ ACTOR Future update(StorageServer* data, bool* pReceivedUpdate) { // otherwise pause for part of a second double delayTime = deterministicRandom()->random01(); TraceEvent(SevWarnAlways, "TSSInjectDelay", data->thisServerID).detail("Delay", delayTime); - printf("TSS %s INJECTING DELAY for %.4f!!\n", data->thisServerID.toString().c_str(), delayTime); wait(delay(delayTime)); } } @@ -3573,8 +3348,6 @@ ACTOR Future update(StorageServer* data, bool* pReceivedUpdate) { } } if (cursor->popped() > 0) { - printf( - "Worker removed because of popped=%d: %s\n", cursor->popped(), data->thisServerID.toString().c_str()); throw worker_removed(); } @@ -3982,9 +3755,6 @@ ACTOR Future updateStorage(StorageServer* data) 
{ #endif void StorageServerDisk::makeNewStorageServerDurable() { - // TODO REMOVE print - printf( - "%sSS %s saving durable state\n", data->tssPairID.present() ? "T" : "", data->thisServerID.toString().c_str()); storage->set(persistFormat); storage->set(KeyValueRef(persistID, BinaryWriter::toValue(data->thisServerID, Unversioned()))); if (data->tssPairID.present()) { @@ -4268,17 +4038,6 @@ ACTOR Future restoreDurableState(StorageServer* data, IKeyValueStore* stor debug_checkRestoredVersion(data->thisServerID, version, "StorageServer"); data->setInitialVersion(version); - // TODO REMOVE print - printf("%sSS %s restored durable state @ %lld\n", - data->tssPairID.present() ? "T" : "", - data->thisServerID.toString().c_str(), - version); - if (data->tssPairID.present()) { - printf("TSS %s recovered pairing to SS %s\n", - data->thisServerID.toString().c_str(), - data->tssPairID.get().toString().c_str()); - } - state RangeResult available = fShardAvailable.get(); state int availableLoc; for (availableLoc = 0; availableLoc < available.size(); availableLoc++) { @@ -4565,9 +4324,9 @@ ACTOR Future metricsCore(StorageServer* self, StorageServerInterface ssi) SERVER_KNOBS->STORAGE_LOGGING_DELAY, &self->counters.cc, self->thisServerID.toString() + "/StorageMetrics", - [self=self](TraceEvent& te) { + [self = self](TraceEvent& te) { te.detail("Tag", self->tag.toString()); - StorageBytes sb = self->storage.getStorageBytes(); + StorageBytes sb = self->storage.getStorageBytes(); te.detail("KvstoreBytesUsed", sb.used); te.detail("KvstoreBytesFree", sb.free); te.detail("KvstoreBytesAvailable", sb.available); @@ -4688,19 +4447,6 @@ ACTOR Future serveGetKeyValuesRequests(StorageServer* self, FutureStreamisTss() ? 
"T" : "", - self->thisServerID.toString().c_str(), - req.begin.getKey().printable().c_str(), - req.end.getKey().printable().c_str(), - req.version); - } - - // A TSS should never be the source for fetch keys - ASSERT(!self->tssPairID.present() || !req.isFetchKeys); - // Warning: This code is executed at extremely high priority (TaskPriority::LoadBalancedEndpoint), so downgrade // before doing real work self->actors.add(self->readGuard(req, getKeyValuesQ)); @@ -4939,18 +4685,8 @@ ACTOR Future storageServerCore(StorageServer* self, StorageServerInterface ClientDBInfo clientInfo = self->db->get().client; Optional myTssPair = clientInfo.getTssPair(self->thisServerID); if (myTssPair.present()) { - // TODO REMOVE print, just for debugging - if (!self->ssPairID.present()) { - printf("SS %s found tss pair %s\n", - self->thisServerID.toString().c_str(), - myTssPair.get().id().toString().c_str()); - } self->setSSWithTssPair(myTssPair.get().id()); } else { - // TODO REMOVE print, just for debugging - if (self->ssPairID.present()) { - printf("SS %s lost tss pair\n", self->thisServerID.toString().c_str()); - } self->clearSSWithTssPair(); } } @@ -5057,17 +4793,11 @@ ACTOR Future storageServer(IKeyValueStore* persistentData, Reference> db, std::string folder) { state StorageServer self(persistentData, db, ssi); - if (ssi.isTss) { - self.setTssPair(ssi.tssPairID); + if (ssi.isTss()) { + self.setTssPair(ssi.tssPairID.get()); ASSERT(self.isTss()); } - // TODO REMOVE - printf("initializing %sstorage %s with tag %s and tss pair=%s\n", - ssi.isTss ? "testing " : "", - ssi.id().toString().c_str(), - seedTag.toString().c_str(), - self.tssPairID.present() ? self.tssPairID.get().toString().c_str() : ""); self.sk = serverKeysPrefixFor(self.tssPairID.present() ? 
self.tssPairID.get() : self.thisServerID) .withPrefix(systemKeys.begin); // FFFF/serverKeys/[this server]/ self.folder = folder; @@ -5080,12 +4810,7 @@ ACTOR Future storageServer(IKeyValueStore* persistentData, std::pair verAndTag = wait(addStorageServer( self.cx, ssi)); // Might throw recruitment_failed in case of simultaneous master failure self.tag = verAndTag.second; - // self.setInitialVersion(ssi.isTss ? 0 : verAndTag.first - 1); - if (ssi.isTss) { - printf("TSS %s overriding initial version from %lld to %lld\n", - ssi.id().toString().c_str(), - verAndTag.first - 1, - tssSeedVersion); + if (ssi.isTss()) { self.setInitialVersion(tssSeedVersion); } else { self.setInitialVersion(verAndTag.first - 1); @@ -5100,7 +4825,7 @@ ACTOR Future storageServer(IKeyValueStore* persistentData, TraceEvent("StorageServerInit", ssi.id()) .detail("Version", self.version.get()) .detail("SeedTag", seedTag.toString()) - .detail("TssPair", ssi.isTss ? ssi.tssPairID.toString() : ""); + .detail("TssPair", ssi.isTss() ? ssi.tssPairID.get().toString() : ""); InitializeStorageReply rep; rep.interf = ssi; rep.addedVersion = self.version.get(); @@ -5121,10 +4846,7 @@ ACTOR Future storageServer(IKeyValueStore* persistentData, } ACTOR Future replaceInterface(StorageServer* self, StorageServerInterface ssi) { - printf("SS %s replacing interface\ngetValue=%s\n", - ssi.id().toString().c_str(), - ssi.getValue.getEndpoint().token.toString().c_str()); - ASSERT(!ssi.isTss); + ASSERT(!ssi.isTss()); state Transaction tr(self->cx); loop { @@ -5140,16 +4862,8 @@ ACTOR Future replaceInterface(StorageServer* self, StorageServerInterface : Never())) { state GetStorageServerRejoinInfoReply rep = _rep; - printf("SS %s got rejoin reply:\nversion: %" PRIu64 "\ntag: %s\nnewTag: %s\nnewLocality: %s\n", - ssi.id().toString().c_str(), - rep.version, - rep.tag.toString().c_str(), - rep.newTag.present() ? rep.newTag.get().toString().c_str() : "", - rep.newLocality ? 
"true" : "false"); - try { tr.reset(); - // TODO why doesn't this need ACCESS_SYSTEM_KEYS? tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); tr.setVersion(rep.version); @@ -5184,7 +4898,6 @@ ACTOR Future replaceInterface(StorageServer* self, StorageServerInterface choose { when(wait(tr.commit())) { - printf("SS committed rejoin txn\n"); self->history = rep.history; if (rep.newTag.present()) { @@ -5213,7 +4926,6 @@ ACTOR Future replaceInterface(StorageServer* self, StorageServerInterface when(wait(infoChanged)) {} } } catch (Error& e) { - printf("rejoin txn got error: %d!!\n", e.code()); wait(tr.onError(e)); } } @@ -5229,20 +4941,14 @@ ACTOR Future replaceTSSInterface(StorageServer* self, StorageServerInterfa state Reference tr = makeReference(self->cx); state KeyBackedMap tssMapDB = KeyBackedMap(tssMappingKeys.begin); - ASSERT(ssi.isTss); - - printf("TSS %s replacing interface:\ngetValue=%s\n", - ssi.id().toString().c_str(), - ssi.getValue.getEndpoint().token.toString().c_str()); - - // TODO should this loop until successful? it should never have conflicts, in theory + ASSERT(ssi.isTss()); loop { try { state Tag myTag; tr->reset(); - tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); // TODO is this needed? 
+ tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); Optional pairTagValue = wait(tr->get(serverTagKeyFor(self->tssPairID.get()))); @@ -5263,17 +4969,10 @@ ACTOR Future replaceTSSInterface(StorageServer* self, StorageServerInterfa tr->set(tssMappingChangeKey, deterministicRandom()->randomUniqueID().toString()); wait(tr->commit()); - - // TODO trace event instead - printf("tss %s added itself back, got tag %s for partner %s\n", - self->thisServerID.toString().c_str(), - self->tag.toString().c_str(), - self->tssPairID.get().toString().c_str()); self->tag = myTag; break; } catch (Error& e) { - printf("tss replace interface got error %d!!\n", e.code()); wait(tr->onError(e)); } } @@ -5317,7 +5016,7 @@ ACTOR Future storageServer(IKeyValueStore* persistentData, // if this is a tss storage file, use that as source of truth for this server being a tss instead of the // presence of the tss pair key in the storage engine - if (ssi.isTss) { + if (ssi.isTss()) { ASSERT(self.isTss()); ssi.tssPairID = self.tssPairID.get(); } else { diff --git a/fdbserver/tester.actor.cpp b/fdbserver/tester.actor.cpp index 1b23040d0d..121e2477e0 100644 --- a/fdbserver/tester.actor.cpp +++ b/fdbserver/tester.actor.cpp @@ -979,8 +979,7 @@ ACTOR Future runTest(Database cx, testers, quiescent, spec.runConsistencyCheckOnCache, - // spec.runConsistencyCheckOnTSS, // TODO override with true to test - true, + spec.runConsistencyCheckOnTSS, 10000.0, 18000, spec.databasePingDelay, @@ -1429,19 +1428,14 @@ ACTOR Future runTests(ReferenceisSimulated() && enableDD) { - printf("waiting for DD\n"); wait(success(setDDMode(cx, 1))); - printf("done waiting for DD\n"); } } catch (Error& e) { TraceEvent(SevError, "TestFailure").error(e).detail("Reason", "Unable to set starting configuration"); } } - printf("starting configuration set, moving on\n"); - if (useDB && waitForQuiescenceBegin) { TraceEvent("TesterStartingPreTestChecks") 
.detail("DatabasePingDelay", databasePingDelay) @@ -1457,8 +1451,6 @@ ACTOR Future runTests(Reference storageServerRollbackRebooter(Future prevStorageServer, StorageServerInterface recruited; recruited.uniqueID = id; recruited.locality = locality; - recruited.isTss = isTss; + recruited.tssPairID = + isTss ? Optional(UID()) : Optional(); // set this here since we use its presence to determine + // whether this server is a tss or not recruited.initEndpoints(); DUMPTOKEN(recruited.getValue); @@ -1110,14 +1112,15 @@ ACTOR Future workerServer(Reference connFile, // TODO might be more efficient to mark a boolean on DiskStore in getDiskStores, but that kind of breaks // the abstraction since DiskStore also applies to storage cache + tlog bool isTss = s.filename.find(tssPrefix) != std::string::npos; - // TODO REMOVE after test - printf("%s is%s tss filename\n", s.filename.c_str(), isTss ? "" : " not"); Role ssRole = isTss ? Role::TESTING_STORAGE_SERVER : Role::STORAGE_SERVER; StorageServerInterface recruited; recruited.uniqueID = s.storeID; recruited.locality = locality; - recruited.isTss = isTss; + recruited.tssPairID = + isTss ? Optional(UID()) + : Optional(); // presence of optional is used as source of truth for tss vs not. Value + // gets overridden later in restoreDurableState recruited.initEndpoints(); std::map details; @@ -1509,27 +1512,17 @@ ACTOR Future workerServer(Reference connFile, when(InitializeStorageRequest req = waitNext(interf.storage.getFuture())) { if (!storageCache.exists(req.reqId)) { - printf("Got " - "InitializeStorageRequest:seedTag=%s\nreqId=%s\ninterfaceId=%s\nstoreType=%s\nisTss=%" - "s\ntssPairID=%s\ntssPairVersion=%lld\n\n", - req.seedTag.toString().c_str(), - req.reqId.toString().c_str(), - req.interfaceId.toString().c_str(), - req.storeType.toString().c_str(), - req.isTss ? "true" : "false", - req.isTss ? req.tssPairID.toString().c_str() : "", - req.isTss ? 
req.tssPairVersion : 0); + bool isTss = req.tssPairIDAndVersion.present(); StorageServerInterface recruited(req.interfaceId); recruited.locality = locality; - recruited.isTss = req.isTss; - recruited.tssPairID = req.tssPairID; + recruited.tssPairID = isTss ? req.tssPairIDAndVersion.get().first : Optional(); recruited.initEndpoints(); std::map details; details["StorageEngine"] = req.storeType.toString(); - details["IsTSS"] = std::to_string(recruited.isTss); - Role ssRole = recruited.isTss ? Role::TESTING_STORAGE_SERVER : Role::STORAGE_SERVER; + details["IsTSS"] = std::to_string(isTss); + Role ssRole = isTss ? Role::TESTING_STORAGE_SERVER : Role::STORAGE_SERVER; startRole(ssRole, recruited.id(), interf.id(), details); DUMPTOKEN(recruited.getValue); @@ -1545,21 +1538,25 @@ ACTOR Future workerServer(Reference connFile, DUMPTOKEN(recruited.getQueuingMetrics); DUMPTOKEN(recruited.getKeyValueStoreType); DUMPTOKEN(recruited.watchValue); - // TODO re-comment! - printf("Recruited as storageServer\n"); + // printf("Recruited as storageServer\n"); std::string filename = filenameFromId(req.storeType, folder, - recruited.isTss ? testingStoragePrefix.toString() : fileStoragePrefix.toString(), + isTss ? testingStoragePrefix.toString() : fileStoragePrefix.toString(), recruited.id()); IKeyValueStore* data = openKVStore(req.storeType, filename, recruited.id(), memoryLimit); Future kvClosed = data->onClosed(); filesClosed.add(kvClosed); ReplyPromise storageReady = req.reply; storageCache.set(req.reqId, storageReady.getFuture()); - Future s = - storageServer(data, recruited, req.seedTag, req.tssPairVersion, storageReady, dbInfo, folder); + Future s = storageServer(data, + recruited, + req.seedTag, + isTss ? 
req.tssPairIDAndVersion.get().second : 0, + storageReady, + dbInfo, + folder); s = handleIOErrors(s, data, recruited.id(), kvClosed); s = storageCache.removeOnReady(req.reqId, s); s = storageServerRollbackRebooter(s, @@ -1567,7 +1564,7 @@ ACTOR Future workerServer(Reference connFile, filename, recruited.id(), recruited.locality, - req.isTss, + isTss, dbInfo, folder, &filesClosed, diff --git a/fdbserver/workloads/ConsistencyCheck.actor.cpp b/fdbserver/workloads/ConsistencyCheck.actor.cpp index 0aae7ca9d4..3b5156fb1e 100644 --- a/fdbserver/workloads/ConsistencyCheck.actor.cpp +++ b/fdbserver/workloads/ConsistencyCheck.actor.cpp @@ -1062,7 +1062,7 @@ struct ConsistencyCheckWorkload : TestWorkload { .detail("Begin", printable(shard.begin)) .detail("End", printable(shard.end)) .detail("StorageServer", storageServers[i].id()) - .detail("IsTSS", storageServers[i].isTss ? "True" : "False") + .detail("IsTSS", storageServers[i].isTss() ? "True" : "False") .error(reply.getError()); estimatedBytes.push_back(-1); } @@ -1082,8 +1082,9 @@ struct ConsistencyCheckWorkload : TestWorkload { .detail("StorageServer1", storageServers[firstValidStorageServer].id()) .detail("StorageServer2", storageServers[i].id()) .detail("IsTSS", - storageServers[i].isTss || storageServers[firstValidStorageServer].isTss ? "True" - : "False"); + storageServers[i].isTss() || storageServers[firstValidStorageServer].isTss() + ? "True" + : "False"); } } } @@ -1247,24 +1248,14 @@ struct ConsistencyCheckWorkload : TestWorkload { // add TSS to end of list, if configured and if not relocating if (!isRelocating && self->performTSSCheck) { - printf("CCheck: Checking for tss to add: isRelocating: %s, performTSSCheck: %s\n", - isRelocating ? "T" : "F", - self->performTSSCheck ? 
"T" : "F"); int initialSize = storageServers.size(); for (int i = 0; i < initialSize; i++) { Optional tssPair = cx->clientInfo->get().getTssPair(storageServers[i]); if (tssPair.present()) { - printf("CCheck: Adding TSS %s to consistency check!\n", tssPair.get().id().toString().c_str()); storageServers.push_back(tssPair.get().id()); storageServerInterfaces.push_back(tssPair.get()); - } else { - printf("CCheck: SS %s doesn't have tss pair\n", storageServers[i].toString().c_str()); } } - } else { - printf("CCheck: Not checking for tss to add: isRelocating: %s, performTSSCheck: %s\n", - isRelocating ? "T" : "F", - self->performTSSCheck ? "T" : "F"); } state vector estimatedBytes = wait(self->getStorageSizeEstimate(storageServerInterfaces, range)); @@ -1355,7 +1346,7 @@ struct ConsistencyCheckWorkload : TestWorkload { if (g_network->isSimulated()) { int invalidIndex = -1; printf("\n%sSERVER %d (%s); shard = %s - %s:\n", - storageServerInterfaces[j].isTss ? "TSS " : "", + storageServerInterfaces[j].isTss() ? "TSS " : "", j, storageServerInterfaces[j].address().toString().c_str(), printable(req.begin.getKey()).c_str(), @@ -1374,7 +1365,7 @@ struct ConsistencyCheckWorkload : TestWorkload { printf( "\n%sSERVER %d (%s); shard = %s - %s:\n", - storageServerInterfaces[firstValidServer].isTss ? "TSS " : "", + storageServerInterfaces[firstValidServer].isTss() ? "TSS " : "", firstValidServer, storageServerInterfaces[firstValidServer].address().toString().c_str(), printable(req.begin.getKey()).c_str(), @@ -1465,17 +1456,15 @@ struct ConsistencyCheckWorkload : TestWorkload { .detail("ValueMismatchKey", printable(valueMismatchKey)) .detail("MatchingKVPairs", matchingKVPairs) .detail("IsTSS", - storageServerInterfaces[j].isTss || - storageServerInterfaces[firstValidServer].isTss + storageServerInterfaces[j].isTss() || + storageServerInterfaces[firstValidServer].isTss() ? "True" : "False"); - // TODO should the test still fail if TSS is wrong? 
Or is just logging the trace - // logs ok if ((g_network->isSimulated() && g_simulator.tssMode != ISimulator::TSSMode::EnabledDropMutations) || - (!storageServerInterfaces[j].isTss && - !storageServerInterfaces[firstValidServer].isTss)) { + (!storageServerInterfaces[j].isTss() && + !storageServerInterfaces[firstValidServer].isTss())) { self->testFailure("Data inconsistent", true); return false; } @@ -1497,19 +1486,12 @@ struct ConsistencyCheckWorkload : TestWorkload { .detail("UID", storageServerInterfaces[j].id()) .detail("GetKeyValuesToken", storageServerInterfaces[j].getKeyValues.getEndpoint().token) - .detail("IsTSS", storageServerInterfaces[j].isTss ? "True" : "False") + .detail("IsTSS", storageServerInterfaces[j].isTss() ? "True" : "False") .error(e); - printf("CC %sSS %s failed with error % d\n", - storageServerInterfaces[j].isTss ? "T" : "", - storageServers[j].toString().c_str(), - e.code()); - // All shards should be available in quiscence - // TODO should the test still fail if TSS is unavailable? Or is just logging the trace - // logs ok if (self->performQuiescentChecks && - (g_network->isSimulated() || !storageServerInterfaces[j].isTss)) { + (g_network->isSimulated() || !storageServerInterfaces[j].isTss())) { self->testFailure("Storage server unavailable"); return false; } @@ -1604,7 +1586,6 @@ struct ConsistencyCheckWorkload : TestWorkload { bool hasValidEstimate = estimatedBytes.size() > 0; // If the storage servers' sampled estimate of shard size is different from ours - // TODO should the test still fail if TSS has wrong estimate? 
Or is just logging the trace logs ok if (self->performQuiescentChecks) { for (int j = 0; j < estimatedBytes.size(); j++) { if (estimatedBytes[j] >= 0 && estimatedBytes[j] != sampledBytes) { @@ -1612,9 +1593,9 @@ struct ConsistencyCheckWorkload : TestWorkload { .detail("EstimatedBytes", estimatedBytes[j]) .detail("CorrectSampledBytes", sampledBytes) .detail("StorageServer", storageServers[j]) - .detail("IsTSS", storageServerInterfaces[j].isTss ? "True" : "False"); + .detail("IsTSS", storageServerInterfaces[j].isTss() ? "True" : "False"); - if (!storageServerInterfaces[j].isTss) { + if (!storageServerInterfaces[j].isTss()) { self->testFailure("Storage servers had incorrect sampled estimate"); } @@ -1622,7 +1603,7 @@ struct ConsistencyCheckWorkload : TestWorkload { break; } else if (estimatedBytes[j] < 0 && - (g_network->isSimulated() || !storageServerInterfaces[j].isTss)) { + (g_network->isSimulated() || !storageServerInterfaces[j].isTss())) { self->testFailure("Could not get storage metrics from server"); hasValidEstimate = false; break; @@ -1734,8 +1715,9 @@ struct ConsistencyCheckWorkload : TestWorkload { if (!keyValueStoreType.present()) { TraceEvent("ConsistencyCheck_ServerUnavailable").detail("ServerID", storageServers[i].id()); self->testFailure("Storage server unavailable"); - } else if ((!storageServers[i].isTss && keyValueStoreType.get() != configuration.storageServerStoreType) || - (storageServers[i].isTss && + } else if ((!storageServers[i].isTss() && + keyValueStoreType.get() != configuration.storageServerStoreType) || + (storageServers[i].isTss() && keyValueStoreType.get() != configuration.testingStorageServerStoreType)) { TraceEvent("ConsistencyCheck_WrongKeyValueStoreType") .detail("ServerID", storageServers[i].id()) @@ -1747,10 +1729,6 @@ struct ConsistencyCheckWorkload : TestWorkload { // Check each pair of storage servers for an address match for (j = i + 1; j < storageServers.size(); j++) { - // TODO change this hack back once i fix recruitment - 
/*if (storageServers[i].isTss || storageServers[j].isTss) { - continue; - }*/ if (storageServers[i].address() == storageServers[j].address()) { TraceEvent("ConsistencyCheck_UndesirableServer") .detail("StorageServer1", storageServers[i].id()) @@ -1773,16 +1751,6 @@ struct ConsistencyCheckWorkload : TestWorkload { state vector storageServers = wait(getStorageServers(cx)); std::vector> missingStorage; // vector instead of a set to get the count - printf("CC starting check for storage: %d workers, %d SS\n", workers.size(), storageServers.size()); - printf("CC checking %d regions: ", configuration.regions.size()); - if (configuration.regions.size() == 1) { - printf("%s", configuration.regions[0].dcId.toString().c_str()); - } else if (configuration.regions.size() == 2) { - printf("%s %s", - configuration.regions[0].dcId.toString().c_str(), - configuration.regions[1].dcId.toString().c_str()); - } - printf("\n"); for (int i = 0; i < workers.size(); i++) { NetworkAddress addr = workers[i].interf.stableAddress(); if (!configuration.isExcludedServer(workers[i].interf.addresses()) && @@ -1792,29 +1760,10 @@ struct ConsistencyCheckWorkload : TestWorkload { for (int j = 0; j < storageServers.size(); j++) { if (storageServers[j].stableAddress() == addr) { found = true; - printf("CC found SS %s on %s in dc %s\n", - storageServers[j].id().toString().c_str(), - addr.toString().c_str(), - workers[i].interf.locality.dcId().present() - ? workers[i].interf.locality.dcId().get().toString().c_str() - : ""); break; } } if (!found) { - if (configuration.regions.size() == 0 || - (configuration.regions.size() == 1 && - workers[i].interf.locality.dcId() == configuration.regions[0].dcId) || - (configuration.regions.size() == 2 && - (workers[i].interf.locality.dcId() == configuration.regions[0].dcId || - workers[i].interf.locality.dcId() == configuration.regions[1].dcId))) { - printf("CC found no SS on %s in dc %s\n", - addr.toString().c_str(), - workers[i].interf.locality.dcId().present() - ? 
workers[i].interf.locality.dcId().get().toString().c_str() - : ""); - } - TraceEvent("ConsistencyCheck_NoStorage") .detail("Address", addr) .detail("ProcessClassEqualToStorageClass", @@ -1839,7 +1788,6 @@ struct ConsistencyCheckWorkload : TestWorkload { // TODO could improve this check by also ensuring DD is currently recruiting a TSS by using quietdb? bool couldExpectMissingTss = (configuration.desiredTSSCount - self->dbInfo->get().client.tssMapping.size()) > 0; - printf("CC couldExpectMissingTss = %s\n", couldExpectMissingTss ? "True" : "False"); int countMissing = missingStorage.size(); int acceptableTssMissing = 1; @@ -1858,16 +1806,10 @@ struct ConsistencyCheckWorkload : TestWorkload { } if (!couldExpectMissingTss || countMissing > acceptableTssMissing) { - printf("No storage server on %d workers. CouldBeTSS=%s, acceptableTssMissing=%d\n", - countMissing, - couldExpectMissingTss ? "T" : "F", - acceptableTssMissing); self->testFailure("No storage server on worker"); return false; } else { - // TODO sev=30 warn instead of print - printf("CC found %d missing storage server on worker, but it is likely a tss(es) waiting for a pair\n", - configuration.usableRegions); + TraceEvent(SevWarn, "ConsistencyCheck_TSSMissing"); } } @@ -1885,10 +1827,8 @@ struct ConsistencyCheckWorkload : TestWorkload { state bool foundExtraDataStore = false; state std::vector protectedProcessesToKill; - printf("CC checking for extra data stores\n"); state std::map> statefulProcesses; for (const auto& ss : storageServers) { - printf("CC Marking %ss as ok\n", ss.id().toString().c_str()); statefulProcesses[ss.address()].insert(ss.id()); // A process may have two addresses (same ip, different ports) if (ss.secondaryAddress().present()) { @@ -1945,9 +1885,6 @@ struct ConsistencyCheckWorkload : TestWorkload { if (statefulProcesses[itr->interf.address()].count(id)) { continue; } - printf("CC found extra data store %s on %s\n", - id.toString().c_str(), - itr->interf.address().toString().c_str()); 
// For extra data store TraceEvent("ConsistencyCheck_ExtraDataStore") .detail("Address", itr->interf.address()) @@ -1980,10 +1917,7 @@ struct ConsistencyCheckWorkload : TestWorkload { } } - printf("CC check for extra data stores complete\n"); - if (foundExtraDataStore) { - printf("CC Extra Data Stores\n"); self->testFailure("Extra data stores present on workers"); return false; } diff --git a/fdbserver/workloads/RandomMoveKeys.actor.cpp b/fdbserver/workloads/RandomMoveKeys.actor.cpp index 4fe5654a18..887c6da897 100644 --- a/fdbserver/workloads/RandomMoveKeys.actor.cpp +++ b/fdbserver/workloads/RandomMoveKeys.actor.cpp @@ -168,7 +168,7 @@ struct MoveKeysWorkload : TestWorkload { count[servers[s].address()]++; int o = 0; for (int s = 0; s < servers.size(); s++) - if (count[servers[s].address()] == 1 && !servers[s].isTss) + if (count[servers[s].address()] == 1 && !servers[s].isTss()) servers[o++] = servers[s]; servers.resize(o); } diff --git a/fdbserver/workloads/workloads.actor.h b/fdbserver/workloads/workloads.actor.h index ffd669e88b..d85e3469b5 100644 --- a/fdbserver/workloads/workloads.actor.h +++ b/fdbserver/workloads/workloads.actor.h @@ -152,7 +152,7 @@ public: databasePingDelay = g_network->isSimulated() ? 
0.0 : 15.0; runConsistencyCheck = g_network->isSimulated(); runConsistencyCheckOnCache = false; - runConsistencyCheckOnTSS = false; + runConsistencyCheckOnTSS = true; waitForQuiescenceBegin = true; waitForQuiescenceEnd = true; simCheckRelocationDuration = false; diff --git a/flow/Knobs.cpp b/flow/Knobs.cpp index e4d5a4e6f9..1d287feed3 100644 --- a/flow/Knobs.cpp +++ b/flow/Knobs.cpp @@ -234,6 +234,7 @@ void FlowKnobs::initialize(bool randomize, bool isSimulated) { init( BASIC_LOAD_BALANCE_MIN_CPU, 0.05 ); //do not adjust LB probabilities if the proxies are less than 5% utilized init( BASIC_LOAD_BALANCE_BUCKETS, 40 ); //proxies bin recent GRV requests into 40 time bins init( BASIC_LOAD_BALANCE_COMPUTE_PRECISION, 10000 ); //determines how much of the LB usage is holding the CPU usage of the proxy + init( LOAD_BALANCE_TSS_TIMEOUT, 5.0 ); // Health Monitor init( FAILURE_DETECTION_DELAY, 4.0 ); if( randomize && BUGGIFY ) FAILURE_DETECTION_DELAY = 1.0; diff --git a/flow/Knobs.h b/flow/Knobs.h index 67ec3b82b7..9b700613f3 100644 --- a/flow/Knobs.h +++ b/flow/Knobs.h @@ -250,6 +250,7 @@ public: int BASIC_LOAD_BALANCE_COMPUTE_PRECISION; double BASIC_LOAD_BALANCE_MIN_REQUESTS; double BASIC_LOAD_BALANCE_MIN_CPU; + double LOAD_BALANCE_TSS_TIMEOUT; // Health Monitor int FAILURE_DETECTION_DELAY; diff --git a/flow/genericactors.actor.h b/flow/genericactors.actor.h index e0e84c6e25..7bf2a05e63 100644 --- a/flow/genericactors.actor.h +++ b/flow/genericactors.actor.h @@ -1230,8 +1230,6 @@ Future brokenPromiseToMaybeDelivered(Future in) { return t; } catch (Error& e) { if (e.code() == error_code_broken_promise) { - // TODO REMOVE! 
- printf("broken promise!!"); throw request_maybe_delivered(); } throw; diff --git a/flow/serialize.h b/flow/serialize.h index 7653648a80..81bb18ad4d 100644 --- a/flow/serialize.h +++ b/flow/serialize.h @@ -22,9 +22,6 @@ #define FLOW_SERIALIZE_H #pragma once -// TODO REMOVE -#include - #include #include #include @@ -112,12 +109,6 @@ class Serializer { public: static void serialize(Archive& ar, T& t) { t.serialize(ar); - // TODO REMOVE - if (!ar.protocolVersion().isValid()) { - printf("invalid protocol version %" PRIx64 " < %" PRIx64 "!!!\n", - ar.protocolVersion().version(), - ProtocolVersion::minValidProtocolVersion); - } ASSERT(ar.protocolVersion().isValid()); } }; From 95ab07fcb698ff5cbc926ada3c9974b5cb33b031 Mon Sep 17 00:00:00 2001 From: Josh Slocum Date: Tue, 25 May 2021 20:42:07 +0000 Subject: [PATCH 437/461] Adding comments for clarity --- fdbserver/VersionedBTree.actor.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 49eac05655..454e7ea5d9 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -7565,11 +7565,11 @@ TEST_CASE("/redwood/correctness/unit/deltaTree/IntIntPair") { { deterministicRandom()->randomInt(prev.k, next.k), deterministicRandom()->randomInt(prev.v, next.v) }); }; - // Build a set of N unique items + // Build a set of N unique items, where no consecutive items are in the set, a requirement of the seek behavior tests. 
std::set uniqueItems; while (uniqueItems.size() < N) { IntIntPair p = randomPair(); - auto nextP = p; // also check if next highest/lowest key is not in set for testLTE/testGTE + auto nextP = p; // also check if next highest/lowest key is not in set nextP.v++; auto prevP = p; prevP.v--; @@ -7591,7 +7591,7 @@ TEST_CASE("/redwood/correctness/unit/deltaTree/IntIntPair") { std::vector toDelete; while (1) { IntIntPair p = randomPair(); - auto nextP = p; // also check if next highest/lowest key is not in set for testLTE/testGTE + auto nextP = p; // also check if next highest/lowest key is not in the set nextP.v++; auto prevP = p; prevP.v--; @@ -7745,6 +7745,7 @@ TEST_CASE("/redwood/correctness/unit/deltaTree/IntIntPair") { } // SeekLTE to the next possible int pair value after each element to make sure the base element is found + // Assumes no consecutive items are present in the set for (int i = 0; i < items.size(); ++i) { IntIntPair p = items[i]; IntIntPair q = p; @@ -7761,6 +7762,7 @@ TEST_CASE("/redwood/correctness/unit/deltaTree/IntIntPair") { } // SeekGTE to the previous possible int pair value after each element to make sure the base element is found + // Assumes no consecutive items are present in the set for (int i = 0; i < items.size(); ++i) { IntIntPair p = items[i]; IntIntPair q = p; @@ -7796,6 +7798,7 @@ TEST_CASE("/redwood/correctness/unit/deltaTree/IntIntPair") { } // SeekLTE to each element's next possible value, using each element as a hint + // Assumes no consecutive items are present in the set for (int i = 0; i < items.size(); ++i) { IntIntPair p = items[i]; IntIntPair q = p; From d68cb9b04872a505a2bfa4782f8a4e7e83c2efd3 Mon Sep 17 00:00:00 2001 From: Josh Slocum Date: Tue, 25 May 2021 20:06:32 +0000 Subject: [PATCH 438/461] Changing role names and enabling tss by default in consistency check --- fdbserver/SimulatedCluster.actor.cpp | 12 ++++-------- fdbserver/worker.actor.cpp | 4 ++-- fdbserver/workloads/ConsistencyCheck.actor.cpp | 2 +- 
flow/ProtocolVersion.h | 1 - 4 files changed, 7 insertions(+), 12 deletions(-) diff --git a/fdbserver/SimulatedCluster.actor.cpp b/fdbserver/SimulatedCluster.actor.cpp index 70fed849d9..1747047d85 100644 --- a/fdbserver/SimulatedCluster.actor.cpp +++ b/fdbserver/SimulatedCluster.actor.cpp @@ -186,7 +186,7 @@ public: bool configureLocked = false; bool startIncompatibleProcess = false; int logAntiQuorum = -1; - bool firstTestInRestart = false; + bool isFirstTestInRestart = false; // Storage Engine Types: Verify match with SimulationConfig::generateNormalConfig // 0 = "ssd" // 1 = "memory" @@ -1171,7 +1171,7 @@ void SimulationConfig::generateNormalConfig(const TestConfig& testConfig) { } int tssCount = 0; - if (!testconfig.simpleConfig && deterministicRandom()->random01() < 0.25) { + if (!testConfig.simpleConfig && deterministicRandom()->random01() < 0.25) { // 1 or 2 tss tssCount = deterministicRandom()->randomInt(1, 3); } @@ -1189,14 +1189,10 @@ void SimulationConfig::generateNormalConfig(const TestConfig& testConfig) { db.grvProxyCount = 1; db.resolverCount = 1; } + int replication_type = testConfig.simpleConfig ? 1 : (std::max(testConfig.minimumReplication, datacenters > 4 ? deterministicRandom()->randomInt(1, 3) : std::min(deterministicRandom()->randomInt(0, 6), 3))); if (testConfig.config.present()) { set_config(testConfig.config.get()); } else { - int replication_type = testConfig.simpleConfig - ? 1 - : (std::max(testConfig.minimumReplication, - datacenters > 4 ? 
deterministicRandom()->randomInt(1, 3) - : std::min(deterministicRandom()->randomInt(0, 6), 3))); switch (replication_type) { case 0: { TEST(true); // Simulated cluster using custom redundancy mode @@ -1513,7 +1509,7 @@ void SimulationConfig::generateNormalConfig(const TestConfig& testConfig) { tssCount = std::max(0, std::min(tssCount, (db.usableRegions * (machine_count / datacenters) - replication_type) / 2)); - if (tssCount > 0) { + if (!testConfig.config.present() && tssCount > 0) { std::string confStr = format("tss_count:=%d tss_storage_engine:=%d", tssCount, db.storageServerStoreType); set_config(confStr); double tssRandom = deterministicRandom()->random01(); diff --git a/fdbserver/worker.actor.cpp b/fdbserver/worker.actor.cpp index 11b90dbae5..d48b9cd628 100644 --- a/fdbserver/worker.actor.cpp +++ b/fdbserver/worker.actor.cpp @@ -2151,7 +2151,7 @@ ACTOR Future fdbd(Reference connFile, const Role Role::WORKER("Worker", "WK", false); const Role Role::STORAGE_SERVER("StorageServer", "SS"); -const Role Role::TESTING_STORAGE_SERVER("TestingStorageServer", "TS"); +const Role Role::TESTING_STORAGE_SERVER("TestingStorageServer", "ST"); const Role Role::TRANSACTION_LOG("TLog", "TL"); const Role Role::SHARED_TRANSACTION_LOG("SharedTLog", "SL", false); const Role Role::COMMIT_PROXY("CommitProxyServer", "CP"); @@ -2159,7 +2159,7 @@ const Role Role::GRV_PROXY("GrvProxyServer", "GP"); const Role Role::MASTER("MasterServer", "MS"); const Role Role::RESOLVER("Resolver", "RV"); const Role Role::CLUSTER_CONTROLLER("ClusterController", "CC"); -const Role Role::TESTER("TestClient", "TC"); +const Role Role::TESTER("Tester", "TS"); const Role Role::LOG_ROUTER("LogRouter", "LR"); const Role Role::DATA_DISTRIBUTOR("DataDistributor", "DD"); const Role Role::RATEKEEPER("Ratekeeper", "RK"); diff --git a/fdbserver/workloads/ConsistencyCheck.actor.cpp b/fdbserver/workloads/ConsistencyCheck.actor.cpp index 3b5156fb1e..459501198e 100644 --- a/fdbserver/workloads/ConsistencyCheck.actor.cpp 
+++ b/fdbserver/workloads/ConsistencyCheck.actor.cpp @@ -97,7 +97,7 @@ struct ConsistencyCheckWorkload : TestWorkload { ConsistencyCheckWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) { performQuiescentChecks = getOption(options, LiteralStringRef("performQuiescentChecks"), false); performCacheCheck = getOption(options, LiteralStringRef("performCacheCheck"), false); - performTSSCheck = getOption(options, LiteralStringRef("performTSSCheck"), false); + performTSSCheck = getOption(options, LiteralStringRef("performTSSCheck"), true); quiescentWaitTimeout = getOption(options, LiteralStringRef("quiescentWaitTimeout"), 600.0); distributed = getOption(options, LiteralStringRef("distributed"), true); shardSampleFactor = std::max(getOption(options, LiteralStringRef("shardSampleFactor"), 1), 1); diff --git a/flow/ProtocolVersion.h b/flow/ProtocolVersion.h index 7feb6b3839..af7c6f1108 100644 --- a/flow/ProtocolVersion.h +++ b/flow/ProtocolVersion.h @@ -138,7 +138,6 @@ public: // introduced features PROTOCOL_VERSION_FEATURE(0x0FDB00B070010000LL, StableInterfaces); PROTOCOL_VERSION_FEATURE(0x0FDB00B070010001LL, TagThrottleValueReason); PROTOCOL_VERSION_FEATURE(0x0FDB00B070010001LL, SpanContext); - // TODO is this right? 
PROTOCOL_VERSION_FEATURE(0x0FDB00B070010001LL, TSS); }; From 6bd7fa4036bc0cad3cbcf22655379d8868df84b6 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Wed, 24 Mar 2021 16:27:35 -0600 Subject: [PATCH 439/461] Actually close files in simulation --- fdbrpc/AsyncFileNonDurable.actor.h | 4 ++++ fdbrpc/sim2.actor.cpp | 16 ++++++++-------- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/fdbrpc/AsyncFileNonDurable.actor.h b/fdbrpc/AsyncFileNonDurable.actor.h index 00c0f7441d..c5f065a4e3 100644 --- a/fdbrpc/AsyncFileNonDurable.actor.h +++ b/fdbrpc/AsyncFileNonDurable.actor.h @@ -276,6 +276,10 @@ public: Future deleteFuture = deleteFile(this); if (!deleteFuture.isReady()) filesBeingDeleted[filename] = deleteFuture; + } else if (isSoleOwner()) { + // isSoleOwner is a bit confusing here. What we mean is that the openFiles map is the sole owner. If we + // we remove the file from the map to make sure it gets closed. + g_simulator.getCurrentProcess()->machine->openFiles.erase(filename); } } diff --git a/fdbrpc/sim2.actor.cpp b/fdbrpc/sim2.actor.cpp index 1af14ec676..6cddbb7e88 100644 --- a/fdbrpc/sim2.actor.cpp +++ b/fdbrpc/sim2.actor.cpp @@ -536,7 +536,10 @@ public: std::string getFilename() const override { return actualFilename; } - ~SimpleFile() override { _close(h); } + ~SimpleFile() override { + _close(h); + --openCount; + } private: int h; @@ -1933,10 +1936,7 @@ public: TraceEvent("ClogInterface") .detail("IP", ip.toString()) .detail("Delay", seconds) - .detail("Queue", - mode == ClogSend ? "Send" - : mode == ClogReceive ? "Receive" - : "All"); + .detail("Queue", mode == ClogSend ? "Send" : mode == ClogReceive ? "Receive" : "All"); if (mode == ClogSend || mode == ClogAll) g_clogging.clogSendFor(ip, seconds); @@ -2408,9 +2408,9 @@ int sf_open(const char* filename, int flags, int convFlags, int mode) { GENERIC_READ | ((flags & IAsyncFile::OPEN_READWRITE) ? 
GENERIC_WRITE : 0), FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, nullptr, - (flags & IAsyncFile::OPEN_EXCLUSIVE) ? CREATE_NEW - : (flags & IAsyncFile::OPEN_CREATE) ? OPEN_ALWAYS - : OPEN_EXISTING, + (flags & IAsyncFile::OPEN_EXCLUSIVE) + ? CREATE_NEW + : (flags & IAsyncFile::OPEN_CREATE) ? OPEN_ALWAYS : OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, nullptr); int h = -1; From f32ce0c4b54265f2961f81663c1cc77a177f2e2d Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Wed, 24 Mar 2021 16:56:11 -0600 Subject: [PATCH 440/461] fix typo --- fdbrpc/AsyncFileNonDurable.actor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbrpc/AsyncFileNonDurable.actor.h b/fdbrpc/AsyncFileNonDurable.actor.h index c5f065a4e3..a1508f7fef 100644 --- a/fdbrpc/AsyncFileNonDurable.actor.h +++ b/fdbrpc/AsyncFileNonDurable.actor.h @@ -278,7 +278,7 @@ public: filesBeingDeleted[filename] = deleteFuture; } else if (isSoleOwner()) { // isSoleOwner is a bit confusing here. What we mean is that the openFiles map is the sole owner. If we - // we remove the file from the map to make sure it gets closed. + // remove the file from the map to make sure it gets closed. g_simulator.getCurrentProcess()->machine->openFiles.erase(filename); } } From 04613c3b1346fbe2b23bf4d1fb8edfc6a7d9ae02 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Wed, 24 Mar 2021 19:57:24 -0600 Subject: [PATCH 441/461] handle file renames properly --- fdbrpc/AsyncFileNonDurable.actor.h | 12 +++++++++++- flow/flow.h | 2 ++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/fdbrpc/AsyncFileNonDurable.actor.h b/fdbrpc/AsyncFileNonDurable.actor.h index a1508f7fef..28b3506d6e 100644 --- a/fdbrpc/AsyncFileNonDurable.actor.h +++ b/fdbrpc/AsyncFileNonDurable.actor.h @@ -279,7 +279,17 @@ public: } else if (isSoleOwner()) { // isSoleOwner is a bit confusing here. What we mean is that the openFiles map is the sole owner. If we // remove the file from the map to make sure it gets closed. 
- g_simulator.getCurrentProcess()->machine->openFiles.erase(filename); + auto& openFiles = g_simulator.getCurrentProcess()->machine->openFiles; + auto iter = openFiles.find(filename); + // the file could've been renamed (DiskQueue does that for example). In that case the file won't be in the + // map anymore. + if (iter != openFiles.end()) { + // even if the filename exists, it doesn't mean that it references the same file. It could be that the + // file was renamed and later a file with the same name was opened. + if (iter->second.canGet() && iter->second.get().getPtr() == this) { + openFiles.erase(filename); + } + } } } diff --git a/flow/flow.h b/flow/flow.h index 987572d7c5..e03d598d9b 100644 --- a/flow/flow.h +++ b/flow/flow.h @@ -674,6 +674,8 @@ public: bool isValid() const { return sav != 0; } bool isReady() const { return sav->isSet(); } bool isError() const { return sav->isError(); } + // returns true if get can be called on this future (counterpart of canBeSet on Promises) + bool canGet() const { return isValid() && isReady() && !isError(); } Error& getError() const { ASSERT(isError()); return sav->error_state; From 7cb767fd3c00b459f1546d68add6d67e0ade78b2 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Thu, 25 Mar 2021 13:22:29 -0600 Subject: [PATCH 442/461] only remove files from the open map if they have no modifications in flight --- fdbrpc/AsyncFileNonDurable.actor.h | 49 ++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/fdbrpc/AsyncFileNonDurable.actor.h b/fdbrpc/AsyncFileNonDurable.actor.h index 28b3506d6e..ef686271c5 100644 --- a/fdbrpc/AsyncFileNonDurable.actor.h +++ b/fdbrpc/AsyncFileNonDurable.actor.h @@ -268,6 +268,37 @@ public: //TraceEvent("AsyncFileNonDurable_Destroy", id).detail("Filename", filename); } + // The purpose of this actor is to simply keep a reference to a non-durable file until all pending modifications + // have completed. 
When they return, this actor will die and therefore decrement the reference count by 1. + ACTOR void waitOnOutstandingModifications(Reference self) { + state ISimulator::ProcessInfo* currentProcess = g_simulator.getCurrentProcess(); + state TaskPriority currentTaskID = g_network->getCurrentTask(); + state std::string filename = self->filename; + + wait(g_simulator.onMachine(currentProcess)); + try { + Promise startSyncPromise = self->startSyncPromise; + self->startSyncPromise = Promise(); + startSyncPromise.send(true); + + std::vector> outstandingModifications; + + for (auto itr = self->pendingModifications.ranges().begin(); + itr != self->pendingModifications.ranges().end(); + ++itr) + if (itr->value().isValid() && !itr->value().isReady()) + outstandingModifications.push_back(itr->value()); + + // Ignore errors here so that all modifications can finish + wait(waitForAllReady(outstandingModifications)); + wait(g_simulator.onProcess(currentProcess, currentTaskID)); + } catch (Error& e) { + state Error err = e; + wait(g_simulator.onProcess(currentProcess, currentTaskID)); + throw err; + } + } + void addref() override { ReferenceCounted::addref(); } void delref() override { if (delref_no_destroy()) { @@ -279,6 +310,24 @@ public: } else if (isSoleOwner()) { // isSoleOwner is a bit confusing here. What we mean is that the openFiles map is the sole owner. If we // remove the file from the map to make sure it gets closed. + bool hasPendingModifications = false; + for (auto iter = pendingModifications.ranges().begin(); iter != pendingModifications.ranges().end(); + ++iter) { + if (iter->value().isValid() && !iter->value().isReady()) { + hasPendingModifications = true; + break; + } + } + if (hasPendingModifications) { + // If we still have pending references we won't close the file and instead wait for them. But while we + // wait for those to complete, another actor might open the file. 
So we call into an actor that will + // hold a refernce until all pending operations are complete. If someone opens this file before this + // completes, nothing will happen. Otherwise we will enter delref again but this time + // hasPendingModifications will evalualte to false. + addref(); + waitOnOutstandingModifications(Reference(this)); + return; + } auto& openFiles = g_simulator.getCurrentProcess()->machine->openFiles; auto iter = openFiles.find(filename); // the file could've been renamed (DiskQueue does that for example). In that case the file won't be in the From 7b4de4e037bca8ea9cf99b8a77ab8db594cb9fb3 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Thu, 25 Mar 2021 14:00:07 -0600 Subject: [PATCH 443/461] Revert change --- fdbrpc/AsyncFileNonDurable.actor.h | 47 +++++++----------------------- 1 file changed, 11 insertions(+), 36 deletions(-) diff --git a/fdbrpc/AsyncFileNonDurable.actor.h b/fdbrpc/AsyncFileNonDurable.actor.h index ef686271c5..cc341ea155 100644 --- a/fdbrpc/AsyncFileNonDurable.actor.h +++ b/fdbrpc/AsyncFileNonDurable.actor.h @@ -276,27 +276,20 @@ public: state std::string filename = self->filename; wait(g_simulator.onMachine(currentProcess)); - try { - Promise startSyncPromise = self->startSyncPromise; - self->startSyncPromise = Promise(); - startSyncPromise.send(true); + Promise startSyncPromise = self->startSyncPromise; + self->startSyncPromise = Promise(); + startSyncPromise.send(true); - std::vector> outstandingModifications; + std::vector> outstandingModifications; - for (auto itr = self->pendingModifications.ranges().begin(); - itr != self->pendingModifications.ranges().end(); - ++itr) - if (itr->value().isValid() && !itr->value().isReady()) - outstandingModifications.push_back(itr->value()); + for (auto itr = self->pendingModifications.ranges().begin(); itr != self->pendingModifications.ranges().end(); + ++itr) + if (itr->value().isValid() && !itr->value().isReady()) + outstandingModifications.push_back(itr->value()); - // Ignore 
errors here so that all modifications can finish - wait(waitForAllReady(outstandingModifications)); - wait(g_simulator.onProcess(currentProcess, currentTaskID)); - } catch (Error& e) { - state Error err = e; - wait(g_simulator.onProcess(currentProcess, currentTaskID)); - throw err; - } + // Ignore errors here so that all modifications can finish + wait(waitForAllReady(outstandingModifications)); + wait(g_simulator.onProcess(currentProcess, currentTaskID)); } void addref() override { ReferenceCounted::addref(); } @@ -310,24 +303,6 @@ public: } else if (isSoleOwner()) { // isSoleOwner is a bit confusing here. What we mean is that the openFiles map is the sole owner. If we // remove the file from the map to make sure it gets closed. - bool hasPendingModifications = false; - for (auto iter = pendingModifications.ranges().begin(); iter != pendingModifications.ranges().end(); - ++iter) { - if (iter->value().isValid() && !iter->value().isReady()) { - hasPendingModifications = true; - break; - } - } - if (hasPendingModifications) { - // If we still have pending references we won't close the file and instead wait for them. But while we - // wait for those to complete, another actor might open the file. So we call into an actor that will - // hold a refernce until all pending operations are complete. If someone opens this file before this - // completes, nothing will happen. Otherwise we will enter delref again but this time - // hasPendingModifications will evalualte to false. - addref(); - waitOnOutstandingModifications(Reference(this)); - return; - } auto& openFiles = g_simulator.getCurrentProcess()->machine->openFiles; auto iter = openFiles.find(filename); // the file could've been renamed (DiskQueue does that for example). 
In that case the file won't be in the From cbce2f6f117ed2b6eb0064151648f2c730928844 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Thu, 1 Apr 2021 14:06:13 -0600 Subject: [PATCH 444/461] delete dead code --- fdbrpc/AsyncFileNonDurable.actor.h | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/fdbrpc/AsyncFileNonDurable.actor.h b/fdbrpc/AsyncFileNonDurable.actor.h index cc341ea155..28b3506d6e 100644 --- a/fdbrpc/AsyncFileNonDurable.actor.h +++ b/fdbrpc/AsyncFileNonDurable.actor.h @@ -268,30 +268,6 @@ public: //TraceEvent("AsyncFileNonDurable_Destroy", id).detail("Filename", filename); } - // The purpose of this actor is to simply keep a reference to a non-durable file until all pending modifications - // have completed. When they return, this actor will die and therefore decrement the reference count by 1. - ACTOR void waitOnOutstandingModifications(Reference self) { - state ISimulator::ProcessInfo* currentProcess = g_simulator.getCurrentProcess(); - state TaskPriority currentTaskID = g_network->getCurrentTask(); - state std::string filename = self->filename; - - wait(g_simulator.onMachine(currentProcess)); - Promise startSyncPromise = self->startSyncPromise; - self->startSyncPromise = Promise(); - startSyncPromise.send(true); - - std::vector> outstandingModifications; - - for (auto itr = self->pendingModifications.ranges().begin(); itr != self->pendingModifications.ranges().end(); - ++itr) - if (itr->value().isValid() && !itr->value().isReady()) - outstandingModifications.push_back(itr->value()); - - // Ignore errors here so that all modifications can finish - wait(waitForAllReady(outstandingModifications)); - wait(g_simulator.onProcess(currentProcess, currentTaskID)); - } - void addref() override { ReferenceCounted::addref(); } void delref() override { if (delref_no_destroy()) { From a7564696702442ed7397cce43b53251ad0718f9d Mon Sep 17 00:00:00 2001 From: "A.J. 
Beamon" Date: Wed, 26 May 2021 13:38:24 -0700 Subject: [PATCH 445/461] Use a weak reference in the open files cache (abstracted from a similar cache in AsyncFileCached) to avoid a problem where removing an item from the cache could cause us to reentrantly remove it again. --- fdbrpc/AsyncFileCached.actor.cpp | 3 +- fdbrpc/AsyncFileCached.actor.h | 28 ++++------------ fdbrpc/AsyncFileNonDurable.actor.h | 40 +++++++++++++---------- fdbrpc/sim2.actor.cpp | 32 ++++++++++++------- fdbrpc/simulator.h | 6 +++- fdbserver/SimulatedCluster.actor.cpp | 9 +++--- flow/genericactors.actor.h | 48 ++++++++++++++++++++++++++++ 7 files changed, 110 insertions(+), 56 deletions(-) diff --git a/fdbrpc/AsyncFileCached.actor.cpp b/fdbrpc/AsyncFileCached.actor.cpp index f4a57d4646..984795c105 100644 --- a/fdbrpc/AsyncFileCached.actor.cpp +++ b/fdbrpc/AsyncFileCached.actor.cpp @@ -46,7 +46,8 @@ EvictablePage::~EvictablePage() { } } -std::map AsyncFileCached::openFiles; +// A map of filename to the file handle for all opened cached files +std::map> AsyncFileCached::openFiles; void AsyncFileCached::remove_page(AFCPage* page) { pages.erase(page->pageOffset); diff --git a/fdbrpc/AsyncFileCached.actor.h b/fdbrpc/AsyncFileCached.actor.h index c5b6b3127c..2915b0557c 100644 --- a/fdbrpc/AsyncFileCached.actor.h +++ b/fdbrpc/AsyncFileCached.actor.h @@ -132,27 +132,13 @@ struct EvictablePageCache : ReferenceCounted { const CacheEvictionType cacheEvictionType; }; -struct OpenFileInfo : NonCopyable { - IAsyncFile* f; - Future> opened; // Only valid until the file is fully opened - - OpenFileInfo() : f(0) {} - OpenFileInfo(OpenFileInfo&& r) noexcept : f(r.f), opened(std::move(r.opened)) { r.f = 0; } - - Future> get() { - if (f) - return Reference::addRef(f); - else - return opened; - } -}; - struct AFCPage; class AsyncFileCached final : public IAsyncFile, public ReferenceCounted { friend struct AFCPage; public: + // Opens a file that uses the FDB in-memory page cache static Future> open(std::string 
filename, int flags, int mode) { //TraceEvent("AsyncFileCachedOpen").detail("Filename", filename); if (openFiles.find(filename) == openFiles.end()) { @@ -160,7 +146,7 @@ public: if (f.isReady() && f.isError()) return f; if (!f.isReady()) - openFiles[filename].opened = f; + openFiles[filename] = WeakFutureReference(f); else return f.get(); } @@ -263,7 +249,9 @@ public: ~AsyncFileCached() override; private: - static std::map openFiles; + // A map of filename to the file handle for all opened cached files + static std::map> openFiles; + std::string filename; Reference uncached; int64_t length; @@ -330,6 +318,7 @@ private: static Future> open_impl(std::string filename, int flags, int mode); + // Opens a file that uses the FDB in-memory page cache ACTOR static Future> open_impl(std::string filename, int flags, int mode, @@ -345,10 +334,7 @@ private: TraceEvent("AFCUnderlyingOpenEnd").detail("Filename", filename); int64_t l = wait(f->size()); TraceEvent("AFCUnderlyingSize").detail("Filename", filename).detail("Size", l); - auto& of = openFiles[filename]; - of.f = new AsyncFileCached(f, filename, l, pageCache); - of.opened = Future>(); - return Reference(of.f); + return new AsyncFileCached(f, filename, l, pageCache); } catch (Error& e) { if (e.code() != error_code_actor_cancelled) openFiles.erase(filename); diff --git a/fdbrpc/AsyncFileNonDurable.actor.h b/fdbrpc/AsyncFileNonDurable.actor.h index 28b3506d6e..ccc2ad42b4 100644 --- a/fdbrpc/AsyncFileNonDurable.actor.h +++ b/fdbrpc/AsyncFileNonDurable.actor.h @@ -130,6 +130,9 @@ public: UID id; std::string filename; + // For files that use atomic write and create, they are initially created with an extra suffix + std::string initialFilename; + // An approximation of the size of the file; .size() should be used instead of this variable in most cases mutable int64_t approximateSize; @@ -182,11 +185,13 @@ private: reponses; // cannot call getResult on this actor collection, since the actors will be on different processes 
AsyncFileNonDurable(const std::string& filename, + const std::string& initialFilename, Reference file, Reference diskParameters, NetworkAddress openedAddress, bool aio) - : openedAddress(openedAddress), pendingModifications(uint64_t(-1)), approximateSize(0), reponses(false), + : filename(filename), initialFilename(initialFilename), file(file), diskParameters(diskParameters), + openedAddress(openedAddress), pendingModifications(uint64_t(-1)), approximateSize(0), reponses(false), aio(aio) { // This is only designed to work in simulation @@ -194,9 +199,6 @@ private: this->id = deterministicRandom()->randomUniqueID(); //TraceEvent("AsyncFileNonDurable_Create", id).detail("Filename", filename); - this->file = file; - this->filename = filename; - this->diskParameters = diskParameters; maxWriteDelay = FLOW_KNOBS->NON_DURABLE_MAX_WRITE_DELAY; hasBeenSynced = false; @@ -239,7 +241,7 @@ public: } state Reference nonDurableFile( - new AsyncFileNonDurable(filename, file, diskParameters, currentProcess->address, aio)); + new AsyncFileNonDurable(filename, actualFilename, file, diskParameters, currentProcess->address, aio)); // Causes the approximateSize member to be set state Future sizeFuture = nonDurableFile->size(); @@ -269,25 +271,29 @@ public: } void addref() override { ReferenceCounted::addref(); } + void delref() override { if (delref_no_destroy()) { - ASSERT(filesBeingDeleted.count(filename) == 0); - //TraceEvent("AsyncFileNonDurable_StartDelete", id).detail("Filename", filename); - Future deleteFuture = deleteFile(this); - if (!deleteFuture.isReady()) - filesBeingDeleted[filename] = deleteFuture; - } else if (isSoleOwner()) { - // isSoleOwner is a bit confusing here. What we mean is that the openFiles map is the sole owner. If we - // remove the file from the map to make sure it gets closed. 
+ if (filesBeingDeleted.count(filename) == 0) { + //TraceEvent("AsyncFileNonDurable_StartDelete", id).detail("Filename", filename); + Future deleteFuture = deleteFile(this); + if (!deleteFuture.isReady()) + filesBeingDeleted[filename] = deleteFuture; + } + auto& openFiles = g_simulator.getCurrentProcess()->machine->openFiles; auto iter = openFiles.find(filename); - // the file could've been renamed (DiskQueue does that for example). In that case the file won't be in the - // map anymore. + if (iter == openFiles.end()) { + iter = openFiles.find(initialFilename); + } + + // Various actions (e.g. simulated delete) can remove a file from openFiles prematurely, so it may already + // be gone if (iter != openFiles.end()) { // even if the filename exists, it doesn't mean that it references the same file. It could be that the // file was renamed and later a file with the same name was opened. - if (iter->second.canGet() && iter->second.get().getPtr() == this) { - openFiles.erase(filename); + if (iter->second.getPtrIfReady().orDefault(nullptr) == this) { + openFiles.erase(iter); } } } diff --git a/fdbrpc/sim2.actor.cpp b/fdbrpc/sim2.actor.cpp index 6cddbb7e88..f11caa5461 100644 --- a/fdbrpc/sim2.actor.cpp +++ b/fdbrpc/sim2.actor.cpp @@ -1018,8 +1018,8 @@ public: // Get the size of all files we've created on the server and subtract them from the free space for (auto file = proc->machine->openFiles.begin(); file != proc->machine->openFiles.end(); ++file) { - if (file->second.isReady()) { - totalFileSize += ((AsyncFileNonDurable*)file->second.get().getPtr())->approximateSize; + if (file->second.get().isReady()) { + totalFileSize += ((AsyncFileNonDurable*)file->second.get().get().getPtr())->approximateSize; } numFiles++; } @@ -1936,7 +1936,10 @@ public: TraceEvent("ClogInterface") .detail("IP", ip.toString()) .detail("Delay", seconds) - .detail("Queue", mode == ClogSend ? "Send" : mode == ClogReceive ? "Receive" : "All"); + .detail("Queue", + mode == ClogSend ? 
"Send" + : mode == ClogReceive ? "Receive" + : "All"); if (mode == ClogSend || mode == ClogAll) g_clogging.clogSendFor(ip, seconds); @@ -2408,9 +2411,9 @@ int sf_open(const char* filename, int flags, int convFlags, int mode) { GENERIC_READ | ((flags & IAsyncFile::OPEN_READWRITE) ? GENERIC_WRITE : 0), FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, nullptr, - (flags & IAsyncFile::OPEN_EXCLUSIVE) - ? CREATE_NEW - : (flags & IAsyncFile::OPEN_CREATE) ? OPEN_ALWAYS : OPEN_EXISTING, + (flags & IAsyncFile::OPEN_EXCLUSIVE) ? CREATE_NEW + : (flags & IAsyncFile::OPEN_CREATE) ? OPEN_ALWAYS + : OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, nullptr); int h = -1; @@ -2440,7 +2443,7 @@ Future> Sim2FileSystem::open(const std::string& file actualFilename = filename + ".part"; auto partFile = machineCache.find(actualFilename); if (partFile != machineCache.end()) { - Future> f = AsyncFileDetachable::open(partFile->second); + Future> f = AsyncFileDetachable::open(partFile->second.get()); if (FLOW_KNOBS->PAGE_WRITE_CHECKSUM_HISTORY > 0) f = map(f, [=](Reference r) { return Reference(new AsyncFileWriteChecker(r)); @@ -2448,19 +2451,26 @@ Future> Sim2FileSystem::open(const std::string& file return f; } } - if (machineCache.find(actualFilename) == machineCache.end()) { + + Future> f; + auto itr = machineCache.find(actualFilename); + if (itr == machineCache.end()) { // Simulated disk parameters are shared by the AsyncFileNonDurable and the underlying SimpleFile. 
// This way, they can both keep up with the time to start the next operation auto diskParameters = makeReference(FLOW_KNOBS->SIM_DISK_IOPS, FLOW_KNOBS->SIM_DISK_BANDWIDTH); - machineCache[actualFilename] = - AsyncFileNonDurable::open(filename, + f = AsyncFileNonDurable::open(filename, actualFilename, SimpleFile::open(filename, flags, mode, diskParameters, false), diskParameters, (flags & IAsyncFile::OPEN_NO_AIO) == 0); + + machineCache[actualFilename] = WeakFutureReference(f); + } else { + f = itr->second.get(); } - Future> f = AsyncFileDetachable::open(machineCache[actualFilename]); + + f = AsyncFileDetachable::open(f); if (FLOW_KNOBS->PAGE_WRITE_CHECKSUM_HISTORY > 0) f = map(f, [=](Reference r) { return Reference(new AsyncFileWriteChecker(r)); }); return f; diff --git a/fdbrpc/simulator.h b/fdbrpc/simulator.h index 4b74ed91ba..19bed013f2 100644 --- a/fdbrpc/simulator.h +++ b/fdbrpc/simulator.h @@ -188,10 +188,14 @@ public: Promise shutdownSignal; }; + // A set of data associated with a simulated machine struct MachineInfo { ProcessInfo* machineProcess; std::vector processes; - std::map>> openFiles; + + // A map from filename to file handle for all open files on a machine + std::map> openFiles; + std::set deletingFiles; std::set closingFiles; Optional> machineId; diff --git a/fdbserver/SimulatedCluster.actor.cpp b/fdbserver/SimulatedCluster.actor.cpp index 128eace3a8..5b06143ba0 100644 --- a/fdbserver/SimulatedCluster.actor.cpp +++ b/fdbserver/SimulatedCluster.actor.cpp @@ -175,7 +175,6 @@ class TestConfig { ifs.close(); } - public: int extraDB = 0; int minimumReplication = 0; @@ -708,8 +707,8 @@ ACTOR Future simulatedMachine(ClusterConnectionString connStr, // Copy the file pointers to a vector because the map may be modified while we are killing files std::vector files; for (auto fileItr = machineCache.begin(); fileItr != machineCache.end(); ++fileItr) { - ASSERT(fileItr->second.isReady()); - files.push_back((AsyncFileNonDurable*)fileItr->second.get().getPtr()); 
+ ASSERT(fileItr->second.get().isReady()); + files.push_back((AsyncFileNonDurable*)fileItr->second.get().get().getPtr()); } std::vector> killFutures; @@ -725,7 +724,7 @@ ACTOR Future simulatedMachine(ClusterConnectionString connStr, for (auto it : machineCache) { filenames.insert(it.first); closingStr += it.first + ", "; - ASSERT(it.second.isReady() && !it.second.isError()); + ASSERT(it.second.get().canGet()); } for (auto it : g_simulator.getMachineById(localities.machineId())->deletingFiles) { @@ -1240,7 +1239,7 @@ void SimulationConfig::generateNormalConfig(const TestConfig& testConfig) { if (deterministicRandom()->random01() < 0.5) set_config(format("log_spill:=%d", TLogSpillType::DEFAULT)); } - + if (deterministicRandom()->random01() < 0.5) { set_config("backup_worker_enabled:=1"); } diff --git a/flow/genericactors.actor.h b/flow/genericactors.actor.h index 7bf2a05e63..88360685cc 100644 --- a/flow/genericactors.actor.h +++ b/flow/genericactors.actor.h @@ -1899,6 +1899,54 @@ Future operator>>(Future const& lhs, Future const& rhs) { return runAfter(lhs, rhs); } +// A weak reference type to wrap a future Reference object. +// Once the future is complete, this object holds a pointer to the referenced object but does +// not contribute to its reference count. +template +class WeakFutureReference { +public: + WeakFutureReference() {} + WeakFutureReference(Future> future) : data(new WeakFutureReferenceData(future)) {} + + // Returns a future to obtain a normal reference handle + // If the future is ready, this creates a Reference to wrap the object + Future> get() { + if (!data) { + return Reference(); + } else if (data->ptr.present()) { + return Reference::addRef(data->ptr.get()); + } else { + return data->future; + } + } + + // Returns the raw pointer, if the object is ready + // Note: this should be used with care, as this pointer is not counted as a reference to the object and + // it could be deleted if all normal references are destroyed. 
+ Optional getPtrIfReady() { return data->ptr; } + +private: + // A class to hold the state for a WeakFutureReference + struct WeakFutureReferenceData : public ReferenceCounted, NonCopyable { + Optional ptr; + Future> future; + Future moveResultFuture; + + WeakFutureReferenceData(Future> future) : future(future) { moveResultFuture = moveResult(this); } + + // Waits for the future to complete and then stores the pointer in local storage + // When this completes, we will no longer be counted toward the reference count of the object + ACTOR Future moveResult(WeakFutureReferenceData* self) { + Reference result = wait(self->future); + self->ptr = result.getPtr(); + self->future = Future>(); + return Void(); + } + }; + + Reference data; +}; + #include "flow/unactorcompiler.h" #endif From 944a03d57589f1abbe1641d74f1462e17725eb5a Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Wed, 26 May 2021 16:26:45 -0700 Subject: [PATCH 446/461] For files that use the atomic write and create mechanism, attempt to remove the file from the openFiles map at both its old and new name --- fdbrpc/AsyncFileNonDurable.actor.h | 31 ++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/fdbrpc/AsyncFileNonDurable.actor.h b/fdbrpc/AsyncFileNonDurable.actor.h index ccc2ad42b4..bde8e0fe9e 100644 --- a/fdbrpc/AsyncFileNonDurable.actor.h +++ b/fdbrpc/AsyncFileNonDurable.actor.h @@ -281,20 +281,27 @@ public: filesBeingDeleted[filename] = deleteFuture; } - auto& openFiles = g_simulator.getCurrentProcess()->machine->openFiles; - auto iter = openFiles.find(filename); - if (iter == openFiles.end()) { - iter = openFiles.find(initialFilename); + removeOpenFile(filename, this); + if (initialFilename != filename) { + removeOpenFile(initialFilename, this); } + } + } - // Various actions (e.g. 
simulated delete) can remove a file from openFiles prematurely, so it may already - // be gone - if (iter != openFiles.end()) { - // even if the filename exists, it doesn't mean that it references the same file. It could be that the - // file was renamed and later a file with the same name was opened. - if (iter->second.getPtrIfReady().orDefault(nullptr) == this) { - openFiles.erase(iter); - } + // Removes a file from the openFiles map + static void removeOpenFile(std::string filename, AsyncFileNonDurable* file) { + auto& openFiles = g_simulator.getCurrentProcess()->machine->openFiles; + + auto iter = openFiles.find(filename); + + // Various actions (e.g. simulated delete) can remove a file from openFiles prematurely, so it may already + // be gone. Renamed files (from atomic write and create) will also be present under only one of the two + // names. + if (iter != openFiles.end()) { + // even if the filename exists, it doesn't mean that it references the same file. It could be that the + // file was renamed and later a file with the same name was opened. 
+ if (iter->second.getPtrIfReady().orDefault(nullptr) == file) { + openFiles.erase(iter); } } } From 065c4fdd5a039aa561f73d585426ce23000e7da8 Mon Sep 17 00:00:00 2001 From: Dan Lambright Date: Tue, 30 Mar 2021 12:31:10 -0400 Subject: [PATCH 447/461] issue 4252 --- fdbserver/Coordination.actor.cpp | 78 +++++++++++++++++++++++++--- fdbserver/CoordinationInterface.h | 2 +- fdbserver/Knobs.cpp | 2 + fdbserver/Knobs.h | 3 ++ fdbserver/SimulatedCluster.actor.cpp | 15 ++++-- fdbserver/worker.actor.cpp | 2 +- flow/error_definitions.h | 1 + 7 files changed, 90 insertions(+), 13 deletions(-) diff --git a/fdbserver/Coordination.actor.cpp b/fdbserver/Coordination.actor.cpp index 92de5a5b3c..8443e849eb 100644 --- a/fdbserver/Coordination.actor.cpp +++ b/fdbserver/Coordination.actor.cpp @@ -545,9 +545,15 @@ struct LeaderRegisterCollection { } }; +StringRef getClusterName(Key key) { + StringRef str = key.contents(); + return str.eat(":"); +} + // leaderServer multiplexes multiple leaderRegisters onto a single LeaderElectionRegInterface, // creating and destroying them on demand. 
-ACTOR Future leaderServer(LeaderElectionRegInterface interf, OnDemandStore* pStore, UID id) { +ACTOR Future leaderServer(LeaderElectionRegInterface interf, OnDemandStore* pStore, UID id, + Reference ccf) { state LeaderRegisterCollection regs(pStore); state ActorCollection forwarders(false); @@ -562,6 +568,16 @@ ACTOR Future leaderServer(LeaderElectionRegInterface interf, OnDemandStore info.forward = forward.get().serializedInfo; req.reply.send(CachedSerialization(info)); } else { + StringRef reqClusterName = getClusterName(req.clusterKey); + StringRef clusterName = getClusterName(ccf->getConnectionString().clusterKey()); + if (reqClusterName.compare(clusterName) || + ccf->getConnectionString().coordinators() != req.coordinators) { + TraceEvent(SevWarnAlways, "CCFMismatch") + .detail("RequestType", "OpenDatabaseCoordRequest") + .detail("LocalCS", ccf->getConnectionString().toString()) + .detail("IncomingClusterKey", req.clusterKey) + .detail("IncomingCoordinators", describeList(req.coordinators, req.coordinators.size())); + } regs.getInterface(req.clusterKey, id).openDatabase.send(req); } } @@ -570,6 +586,16 @@ ACTOR Future leaderServer(LeaderElectionRegInterface interf, OnDemandStore if (forward.present()) { req.reply.send(forward.get()); } else { + StringRef reqClusterName = getClusterName(req.key); + StringRef clusterName = getClusterName(ccf->getConnectionString().clusterKey()); + if (reqClusterName.compare(clusterName) || + ccf->getConnectionString().coordinators() != req.coordinators) { + TraceEvent(SevWarnAlways, "CCFMismatch") + .detail("RequestType", "ElectionResultRequest") + .detail("LocalCS", ccf->getConnectionString().toString()) + .detail("IncomingClusterKey", req.key) + .detail("IncomingCoordinators", describeList(req.coordinators, req.coordinators.size())); + } regs.getInterface(req.key, id).electionResult.send(req); } } @@ -577,30 +603,66 @@ ACTOR Future leaderServer(LeaderElectionRegInterface interf, OnDemandStore Optional forward = 
regs.getForward(req.key); if (forward.present()) req.reply.send(forward.get()); - else + else { + StringRef reqClusterName = getClusterName(req.key); + StringRef clusterName = getClusterName(ccf->getConnectionString().clusterKey()); + if (reqClusterName.compare(clusterName)) { + TraceEvent(SevWarnAlways, "CCFMismatch") + .detail("RequestType", "GetLeaderRequest") + .detail("LocalCS", ccf->getConnectionString().toString()) + .detail("IncomingClusterKey", req.key) + .detail("Key", reqClusterName).detail("Key2",clusterName); + } regs.getInterface(req.key, id).getLeader.send(req); + } } when(CandidacyRequest req = waitNext(interf.candidacy.getFuture())) { Optional forward = regs.getForward(req.key); if (forward.present()) req.reply.send(forward.get()); - else + else { + StringRef reqClusterName = getClusterName(req.key); + StringRef clusterName = getClusterName(ccf->getConnectionString().clusterKey()); + if (reqClusterName.compare(clusterName)) { + TraceEvent(SevWarnAlways, "CCFMismatch") + .detail("RequestType", "CandidacyRequest") + .detail("LocalCS", ccf->getConnectionString().toString()) + .detail("IncomingClusterKey", req.key); + } regs.getInterface(req.key, id).candidacy.send(req); + } } when(LeaderHeartbeatRequest req = waitNext(interf.leaderHeartbeat.getFuture())) { Optional forward = regs.getForward(req.key); if (forward.present()) req.reply.send(LeaderHeartbeatReply{ false }); - else + else { + StringRef reqClusterName = getClusterName(req.key); + StringRef clusterName = getClusterName(ccf->getConnectionString().clusterKey()); + if (reqClusterName.compare(clusterName)) { + TraceEvent(SevWarnAlways, "CCFMismatch") + .detail("RequestType", "LeaderHeartbeatRequest") + .detail("LocalCS", ccf->getConnectionString().toString()) + .detail("IncomingClusterKey", req.key); + } regs.getInterface(req.key, id).leaderHeartbeat.send(req); + } } when(ForwardRequest req = waitNext(interf.forward.getFuture())) { Optional forward = regs.getForward(req.key); if 
(forward.present()) req.reply.send(Void()); else { - forwarders.add( - LeaderRegisterCollection::setForward(®s, req.key, ClusterConnectionString(req.conn.toString()))); + StringRef reqClusterName = getClusterName(req.key); + StringRef clusterName = getClusterName(ccf->getConnectionString().clusterKey()); + if (reqClusterName.compare(clusterName)) { + TraceEvent(SevWarnAlways, "CCFMismatch") + .detail("RequestType", "ForwardRequest") + .detail("LocalCS", ccf->getConnectionString().toString()) + .detail("IncomingClusterKey", req.key); + } + forwarders.add(LeaderRegisterCollection::setForward(®s, req.key, + ClusterConnectionString(req.conn.toString()))); regs.getInterface(req.key, id).forward.send(req); } } @@ -611,7 +673,7 @@ ACTOR Future leaderServer(LeaderElectionRegInterface interf, OnDemandStore } } -ACTOR Future coordinationServer(std::string dataFolder) { +ACTOR Future coordinationServer(std::string dataFolder, Reference ccf) { state UID myID = deterministicRandom()->randomUniqueID(); state LeaderElectionRegInterface myLeaderInterface(g_network); state GenerationRegInterface myInterface(g_network); @@ -622,7 +684,7 @@ ACTOR Future coordinationServer(std::string dataFolder) { .detail("Folder", dataFolder); try { - wait(localGenerationReg(myInterface, &store) || leaderServer(myLeaderInterface, &store, myID) || + wait(localGenerationReg(myInterface, &store) || leaderServer(myLeaderInterface, &store, myID, ccf) || store.getError()); throw internal_error(); } catch (Error& e) { diff --git a/fdbserver/CoordinationInterface.h b/fdbserver/CoordinationInterface.h index 5e824ee0ee..ea379d1358 100644 --- a/fdbserver/CoordinationInterface.h +++ b/fdbserver/CoordinationInterface.h @@ -225,6 +225,6 @@ public: vector stateServers; }; -Future coordinationServer(std::string const& dataFolder); +Future coordinationServer(std::string const& dataFolder, Reference const& ccf); #endif diff --git a/fdbserver/Knobs.cpp b/fdbserver/Knobs.cpp index fc1234d243..d3e32203d9 100644 --- 
a/fdbserver/Knobs.cpp +++ b/fdbserver/Knobs.cpp @@ -631,6 +631,8 @@ void ServerKnobs::initialize(bool randomize, ClientKnobs* clientKnobs, bool isSi // Coordination init( COORDINATED_STATE_ONCONFLICT_POLL_INTERVAL, 1.0 ); if( randomize && BUGGIFY ) COORDINATED_STATE_ONCONFLICT_POLL_INTERVAL = 10.0; + init( FORWARD_REQUEST_TOO_OLD, 600.0 ); if( randomize && BUGGIFY ) FORWARD_REQUEST_TOO_OLD = 60.0; + init( ENABLE_CROSS_CLUSTER_SUPPORT, true ); if( randomize && BUGGIFY ) ENABLE_CROSS_CLUSTER_SUPPORT = false; // Buggification init( BUGGIFIED_EVENTUAL_CONSISTENCY, 1.0 ); diff --git a/fdbserver/Knobs.h b/fdbserver/Knobs.h index be2caba6a1..6bc56d4457 100644 --- a/fdbserver/Knobs.h +++ b/fdbserver/Knobs.h @@ -559,6 +559,9 @@ public: // Coordination double COORDINATED_STATE_ONCONFLICT_POLL_INTERVAL; + double FORWARD_REQUEST_TOO_OLD; + bool ENABLE_CROSS_CLUSTER_SUPPORT; // Allow a coordinator to serve requests whose connection string does not match + // the local copy // Buggification double BUGGIFIED_EVENTUAL_CONSISTENCY; diff --git a/fdbserver/SimulatedCluster.actor.cpp b/fdbserver/SimulatedCluster.actor.cpp index 128eace3a8..72d810961d 100644 --- a/fdbserver/SimulatedCluster.actor.cpp +++ b/fdbserver/SimulatedCluster.actor.cpp @@ -1605,6 +1605,7 @@ void setupSimulatedSystem(vector>* systemActors, TEST(!useIPv6); // Use IPv4 vector coordinatorAddresses; + vector extraCoordinatorAddresses; // Used by extra DB if the DR db is a new one if (testConfig.minimumRegions > 1) { // do not put coordinators in the primary region so that we can kill that region safely int nonPrimaryDcs = dataCenters / 2; @@ -1614,6 +1615,9 @@ void setupSimulatedSystem(vector>* systemActors, auto ip = makeIPAddressForSim(useIPv6, { 2, dc, 1, m }); coordinatorAddresses.push_back( NetworkAddress(ip, sslEnabled && !sslOnly ? 
2 : 1, true, sslEnabled && sslOnly)); + auto extraIp = makeIPAddressForSim(useIPv6, { 4, dc, 1, m }); + extraCoordinatorAddresses.push_back( + NetworkAddress(extraIp, sslEnabled && !sslOnly ? 2 : 1, true, sslEnabled && sslOnly)); TraceEvent("SelectedCoordinator").detail("Address", coordinatorAddresses.back()); } } @@ -1642,6 +1646,9 @@ void setupSimulatedSystem(vector>* systemActors, auto ip = makeIPAddressForSim(useIPv6, { 2, dc, 1, m }); coordinatorAddresses.push_back( NetworkAddress(ip, sslEnabled && !sslOnly ? 2 : 1, true, sslEnabled && sslOnly)); + auto extraIp = makeIPAddressForSim(useIPv6, { 4, dc, 1, m }); + extraCoordinatorAddresses.push_back( + NetworkAddress(extraIp, sslEnabled && !sslOnly ? 2 : 1, true, sslEnabled && sslOnly)); TraceEvent("SelectedCoordinator") .detail("Address", coordinatorAddresses.back()) .detail("M", m) @@ -1678,11 +1685,13 @@ void setupSimulatedSystem(vector>* systemActors, // If extraDB==0, leave g_simulator.extraDB as null because the test does not use DR. if (testConfig.extraDB == 1) { // The DR database can be either a new database or itself - g_simulator.extraDB = new ClusterConnectionString( - coordinatorAddresses, BUGGIFY ? LiteralStringRef("TestCluster:0") : LiteralStringRef("ExtraCluster:0")); + g_simulator.extraDB = + BUGGIFY ? 
new ClusterConnectionString(coordinatorAddresses, LiteralStringRef("TestCluster:0")) + : new ClusterConnectionString(extraCoordinatorAddresses, LiteralStringRef("ExtraCluster:0")); } else if (testConfig.extraDB == 2) { // The DR database is a new database - g_simulator.extraDB = new ClusterConnectionString(coordinatorAddresses, LiteralStringRef("ExtraCluster:0")); + g_simulator.extraDB = + new ClusterConnectionString(extraCoordinatorAddresses, LiteralStringRef("ExtraCluster:0")); } else if (testConfig.extraDB == 3) { // The DR database is the same database g_simulator.extraDB = new ClusterConnectionString(coordinatorAddresses, LiteralStringRef("TestCluster:0")); diff --git a/fdbserver/worker.actor.cpp b/fdbserver/worker.actor.cpp index 5721b154d4..db5c09e0ed 100644 --- a/fdbserver/worker.actor.cpp +++ b/fdbserver/worker.actor.cpp @@ -2047,7 +2047,7 @@ ACTOR Future fdbd(Reference connFile, if (coordFolder.size()) { // SOMEDAY: remove the fileNotFound wrapper and make DiskQueue construction safe from errors setting up // their files - actors.push_back(fileNotFoundToNever(coordinationServer(coordFolder))); + actors.push_back(fileNotFoundToNever(coordinationServer(coordFolder, coordinators.ccf))); } state UID processIDUid = wait(createAndLockProcessIdFile(dataFolder)); diff --git a/flow/error_definitions.h b/flow/error_definitions.h index 70f8750836..4af3aee275 100755 --- a/flow/error_definitions.h +++ b/flow/error_definitions.h @@ -74,6 +74,7 @@ ERROR( disk_adapter_reset, 1050, "The disk queue adpater reset" ) ERROR( batch_transaction_throttled, 1051, "Batch GRV request rate limit exceeded") ERROR( dd_cancelled, 1052, "Data distribution components cancelled") ERROR( dd_not_found, 1053, "Data distributor not found") +ERROR( wrong_connection_file, 1054, "Connection file mismatch") ERROR( broken_promise, 1100, "Broken promise" ) ERROR( operation_cancelled, 1101, "Asynchronous operation cancelled" ) From 60d27d05d8edccaa77c57a04bf2d68b19da4447b Mon Sep 17 00:00:00 2001 
From: Dan Lambright Date: Mon, 5 Apr 2021 15:52:48 -0400 Subject: [PATCH 448/461] add knob enabling cross cluster support (default true) --- fdbserver/Coordination.actor.cpp | 81 +++++++++++-------- fdbserver/Knobs.cpp | 1 - fdbserver/Knobs.h | 4 +- fdbserver/SimulatedCluster.actor.cpp | 2 +- .../workloads/ConfigureDatabase.actor.cpp | 8 +- .../SpecialKeySpaceCorrectness.actor.cpp | 3 +- 6 files changed, 57 insertions(+), 42 deletions(-) diff --git a/fdbserver/Coordination.actor.cpp b/fdbserver/Coordination.actor.cpp index 8443e849eb..b4d5f9f38a 100644 --- a/fdbserver/Coordination.actor.cpp +++ b/fdbserver/Coordination.actor.cpp @@ -568,17 +568,18 @@ ACTOR Future leaderServer(LeaderElectionRegInterface interf, OnDemandStore info.forward = forward.get().serializedInfo; req.reply.send(CachedSerialization(info)); } else { - StringRef reqClusterName = getClusterName(req.clusterKey); - StringRef clusterName = getClusterName(ccf->getConnectionString().clusterKey()); - if (reqClusterName.compare(clusterName) || - ccf->getConnectionString().coordinators() != req.coordinators) { - TraceEvent(SevWarnAlways, "CCFMismatch") + StringRef clusterName = ccf->getConnectionString().clusterKeyName(); + if (! 
SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT && + getClusterName(req.clusterKey).compare(clusterName)) { + TraceEvent(SevError, "CCFMismatch") .detail("RequestType", "OpenDatabaseCoordRequest") .detail("LocalCS", ccf->getConnectionString().toString()) .detail("IncomingClusterKey", req.clusterKey) .detail("IncomingCoordinators", describeList(req.coordinators, req.coordinators.size())); + req.reply.sendError(wrong_connection_file()); + } else { + regs.getInterface(req.clusterKey, id).openDatabase.send(req); } - regs.getInterface(req.clusterKey, id).openDatabase.send(req); } } when(ElectionResultRequest req = waitNext(interf.electionResult.getFuture())) { @@ -586,17 +587,19 @@ ACTOR Future leaderServer(LeaderElectionRegInterface interf, OnDemandStore if (forward.present()) { req.reply.send(forward.get()); } else { - StringRef reqClusterName = getClusterName(req.key); - StringRef clusterName = getClusterName(ccf->getConnectionString().clusterKey()); - if (reqClusterName.compare(clusterName) || - ccf->getConnectionString().coordinators() != req.coordinators) { - TraceEvent(SevWarnAlways, "CCFMismatch") + StringRef clusterName = ccf->getConnectionString().clusterKeyName(); + if (! 
SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT && + getClusterName(req.key).compare(clusterName)) { + TraceEvent(SevError, "CCFMismatch") .detail("RequestType", "ElectionResultRequest") .detail("LocalCS", ccf->getConnectionString().toString()) .detail("IncomingClusterKey", req.key) + .detail("ClusterKey", ccf->getConnectionString().clusterKey()) .detail("IncomingCoordinators", describeList(req.coordinators, req.coordinators.size())); + req.reply.sendError(wrong_connection_file()); + } else { + regs.getInterface(req.key, id).electionResult.send(req); } - regs.getInterface(req.key, id).electionResult.send(req); } } when(GetLeaderRequest req = waitNext(interf.getLeader.getFuture())) { @@ -604,16 +607,18 @@ ACTOR Future leaderServer(LeaderElectionRegInterface interf, OnDemandStore if (forward.present()) req.reply.send(forward.get()); else { - StringRef reqClusterName = getClusterName(req.key); - StringRef clusterName = getClusterName(ccf->getConnectionString().clusterKey()); - if (reqClusterName.compare(clusterName)) { - TraceEvent(SevWarnAlways, "CCFMismatch") + StringRef clusterName = ccf->getConnectionString().clusterKeyName(); + if (! 
SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT && + getClusterName(req.key).compare(clusterName)) { + TraceEvent(SevError, "CCFMismatch") .detail("RequestType", "GetLeaderRequest") .detail("LocalCS", ccf->getConnectionString().toString()) .detail("IncomingClusterKey", req.key) - .detail("Key", reqClusterName).detail("Key2",clusterName); + .detail("ClusterKey", ccf->getConnectionString().clusterKey()); + req.reply.sendError(wrong_connection_file()); + } else { + regs.getInterface(req.key, id).getLeader.send(req); } - regs.getInterface(req.key, id).getLeader.send(req); } } when(CandidacyRequest req = waitNext(interf.candidacy.getFuture())) { @@ -621,15 +626,17 @@ ACTOR Future leaderServer(LeaderElectionRegInterface interf, OnDemandStore if (forward.present()) req.reply.send(forward.get()); else { - StringRef reqClusterName = getClusterName(req.key); - StringRef clusterName = getClusterName(ccf->getConnectionString().clusterKey()); - if (reqClusterName.compare(clusterName)) { - TraceEvent(SevWarnAlways, "CCFMismatch") + StringRef clusterName = ccf->getConnectionString().clusterKeyName(); + if (! 
SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT && + getClusterName(req.key).compare(clusterName)) { + TraceEvent(SevError, "CCFMismatch") .detail("RequestType", "CandidacyRequest") .detail("LocalCS", ccf->getConnectionString().toString()) .detail("IncomingClusterKey", req.key); + req.reply.sendError(wrong_connection_file()); + } else { + regs.getInterface(req.key, id).candidacy.send(req); } - regs.getInterface(req.key, id).candidacy.send(req); } } when(LeaderHeartbeatRequest req = waitNext(interf.leaderHeartbeat.getFuture())) { @@ -637,15 +644,17 @@ ACTOR Future leaderServer(LeaderElectionRegInterface interf, OnDemandStore if (forward.present()) req.reply.send(LeaderHeartbeatReply{ false }); else { - StringRef reqClusterName = getClusterName(req.key); - StringRef clusterName = getClusterName(ccf->getConnectionString().clusterKey()); - if (reqClusterName.compare(clusterName)) { - TraceEvent(SevWarnAlways, "CCFMismatch") + StringRef clusterName = ccf->getConnectionString().clusterKeyName(); + if (! SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT && + getClusterName(req.key).compare(clusterName)) { + TraceEvent(SevError, "CCFMismatch") .detail("RequestType", "LeaderHeartbeatRequest") .detail("LocalCS", ccf->getConnectionString().toString()) .detail("IncomingClusterKey", req.key); + req.reply.sendError(wrong_connection_file()); + } else { + regs.getInterface(req.key, id).leaderHeartbeat.send(req); } - regs.getInterface(req.key, id).leaderHeartbeat.send(req); } } when(ForwardRequest req = waitNext(interf.forward.getFuture())) { @@ -653,17 +662,19 @@ ACTOR Future leaderServer(LeaderElectionRegInterface interf, OnDemandStore if (forward.present()) req.reply.send(Void()); else { - StringRef reqClusterName = getClusterName(req.key); - StringRef clusterName = getClusterName(ccf->getConnectionString().clusterKey()); - if (reqClusterName.compare(clusterName)) { - TraceEvent(SevWarnAlways, "CCFMismatch") + StringRef clusterName = ccf->getConnectionString().clusterKeyName(); + if (! 
SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT && + getClusterName(req.key).compare(clusterName)) { + TraceEvent(SevError, "CCFMismatch") .detail("RequestType", "ForwardRequest") .detail("LocalCS", ccf->getConnectionString().toString()) .detail("IncomingClusterKey", req.key); + req.reply.sendError(wrong_connection_file()); + } else { + forwarders.add( + LeaderRegisterCollection::setForward(®s, req.key, ClusterConnectionString(req.conn.toString()))); + regs.getInterface(req.key, id).forward.send(req); } - forwarders.add(LeaderRegisterCollection::setForward(®s, req.key, - ClusterConnectionString(req.conn.toString()))); - regs.getInterface(req.key, id).forward.send(req); } } when(wait(forwarders.getResult())) { diff --git a/fdbserver/Knobs.cpp b/fdbserver/Knobs.cpp index d3e32203d9..b002204b0b 100644 --- a/fdbserver/Knobs.cpp +++ b/fdbserver/Knobs.cpp @@ -631,7 +631,6 @@ void ServerKnobs::initialize(bool randomize, ClientKnobs* clientKnobs, bool isSi // Coordination init( COORDINATED_STATE_ONCONFLICT_POLL_INTERVAL, 1.0 ); if( randomize && BUGGIFY ) COORDINATED_STATE_ONCONFLICT_POLL_INTERVAL = 10.0; - init( FORWARD_REQUEST_TOO_OLD, 600.0 ); if( randomize && BUGGIFY ) FORWARD_REQUEST_TOO_OLD = 60.0; init( ENABLE_CROSS_CLUSTER_SUPPORT, true ); if( randomize && BUGGIFY ) ENABLE_CROSS_CLUSTER_SUPPORT = false; // Buggification diff --git a/fdbserver/Knobs.h b/fdbserver/Knobs.h index 6bc56d4457..3426f6bb18 100644 --- a/fdbserver/Knobs.h +++ b/fdbserver/Knobs.h @@ -559,9 +559,7 @@ public: // Coordination double COORDINATED_STATE_ONCONFLICT_POLL_INTERVAL; - double FORWARD_REQUEST_TOO_OLD; - bool ENABLE_CROSS_CLUSTER_SUPPORT; // Allow a coordinator to serve requests whose connection string does not match - // the local copy + bool ENABLE_CROSS_CLUSTER_SUPPORT; // Allow a coordinator to serve requests whose connection string does not match the local descriptor // Buggification double BUGGIFIED_EVENTUAL_CONSISTENCY; diff --git a/fdbserver/SimulatedCluster.actor.cpp 
b/fdbserver/SimulatedCluster.actor.cpp index 72d810961d..bfa1f9d007 100644 --- a/fdbserver/SimulatedCluster.actor.cpp +++ b/fdbserver/SimulatedCluster.actor.cpp @@ -1691,7 +1691,7 @@ void setupSimulatedSystem(vector>* systemActors, } else if (testConfig.extraDB == 2) { // The DR database is a new database g_simulator.extraDB = - new ClusterConnectionString(extraCoordinatorAddresses, LiteralStringRef("ExtraCluster:0")); + new ClusterConnectionString(extraCoordinatorAddresses, LiteralStringRef("ExtraCluster:0")); } else if (testConfig.extraDB == 3) { // The DR database is the same database g_simulator.extraDB = new ClusterConnectionString(coordinatorAddresses, LiteralStringRef("TestCluster:0")); diff --git a/fdbserver/workloads/ConfigureDatabase.actor.cpp b/fdbserver/workloads/ConfigureDatabase.actor.cpp index d9193fc9d9..0ab7d1b88b 100644 --- a/fdbserver/workloads/ConfigureDatabase.actor.cpp +++ b/fdbserver/workloads/ConfigureDatabase.actor.cpp @@ -270,6 +270,7 @@ struct ConfigureDatabaseWorkload : TestWorkload { return Void(); } state int randomChoice = deterministicRandom()->randomInt(0, 8); + if (randomChoice == 0) { wait(success( runRYWTransaction(cx, [=](Reference tr) -> Future> { @@ -316,8 +317,13 @@ struct ConfigureDatabaseWorkload : TestWorkload { } else if (randomChoice == 4) { //TraceEvent("ConfigureTestQuorumBegin").detail("NewQuorum", s); auto ch = autoQuorumChange(); + std::string desiredClusterName = "NewName%d"; + if (! 
SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT) { + // if configuration does not allow changing the descriptor, pass empty string (keep old descriptor) + desiredClusterName = ""; + } if (deterministicRandom()->randomInt(0, 2)) - ch = nameQuorumChange(format("NewName%d", deterministicRandom()->randomInt(0, 100)), ch); + ch = nameQuorumChange(format(desiredClusterName.c_str(), deterministicRandom()->randomInt(0, 100)), ch); wait(success(changeQuorum(cx, ch))); //TraceEvent("ConfigureTestConfigureEnd").detail("NewQuorum", s); } else if (randomChoice == 5) { diff --git a/fdbserver/workloads/SpecialKeySpaceCorrectness.actor.cpp b/fdbserver/workloads/SpecialKeySpaceCorrectness.actor.cpp index e6a6650de3..5a38e20d7e 100644 --- a/fdbserver/workloads/SpecialKeySpaceCorrectness.actor.cpp +++ b/fdbserver/workloads/SpecialKeySpaceCorrectness.actor.cpp @@ -936,7 +936,8 @@ struct SpecialKeySpaceCorrectnessWorkload : TestWorkload { // test change coordinators and cluster description // we randomly pick one process(not coordinator) and add it, in this case, it should always succeed { - state std::string new_cluster_description = deterministicRandom()->randomAlphaNumeric(8); + // choose a new description if configuration allows transactions across differently named clusters + state std::string new_cluster_description = SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT ? 
deterministicRandom()->randomAlphaNumeric(8) : cs.clusterKeyName().toString(); state std::string new_coordinator_process; state std::vector old_coordinators_processes; state bool possible_to_add_coordinator; From 742c22cef2eebd74134e58e05a68bf9b4e736678 Mon Sep 17 00:00:00 2001 From: Dan Lambright Date: Thu, 22 Apr 2021 13:01:21 -0400 Subject: [PATCH 449/461] Don't allow changing desriptor if knob is set --- fdbclient/CoordinationInterface.h | 28 +++++++++++++++++++++++++++- fdbclient/ManagementAPI.actor.cpp | 13 ++++++++++--- fdbclient/MonitorLeader.actor.cpp | 4 +++- fdbrpc/FlowTransport.actor.cpp | 4 +++- fdbserver/Coordination.actor.cpp | 4 ++++ fdbserver/CoordinationInterface.h | 12 ++++++------ 6 files changed, 53 insertions(+), 12 deletions(-) diff --git a/fdbclient/CoordinationInterface.h b/fdbclient/CoordinationInterface.h index a852df7a94..2ebd4e1259 100644 --- a/fdbclient/CoordinationInterface.h +++ b/fdbclient/CoordinationInterface.h @@ -32,13 +32,15 @@ const int MAX_CLUSTER_FILE_BYTES = 60000; constexpr UID WLTOKEN_CLIENTLEADERREG_GETLEADER(-1, 2); constexpr UID WLTOKEN_CLIENTLEADERREG_OPENDATABASE(-1, 3); +constexpr UID WLTOKEN_CLIENTLEADERREG_DESCRIPTOR_MUTABLE(-1, 4); -constexpr UID WLTOKEN_PROTOCOL_INFO(-1, 10); +constexpr UID WLTOKEN_PROTOCOL_INFO(-1, 11); // The coordinator interface as exposed to clients struct ClientLeaderRegInterface { RequestStream getLeader; RequestStream openDatabase; + RequestStream checkClusterNameMutability; ClientLeaderRegInterface() {} ClientLeaderRegInterface(NetworkAddress remote); @@ -236,4 +238,28 @@ struct ProtocolInfoRequest { } }; +struct CheckClusterNameMutabilityReply { + constexpr static FileIdentifier file_identifier = 7784299; + CheckClusterNameMutabilityReply() = default; + explicit CheckClusterNameMutabilityReply(bool value) : value(value) {} + bool value; + template + void serialize(Ar& ar) { + serializer(ar, value); + } +}; + +struct CheckClusterNameMutability { + constexpr static FileIdentifier 
file_identifier = 214729; + Key key; + ReplyPromise reply; + explicit CheckClusterNameMutability(Key key) : key(key) {} + CheckClusterNameMutability(){} + + template + void serialize(Ar& ar) { + serializer(ar, key, reply); + } +}; + #endif diff --git a/fdbclient/ManagementAPI.actor.cpp b/fdbclient/ManagementAPI.actor.cpp index 90d670e801..56d9f0e6ec 100644 --- a/fdbclient/ManagementAPI.actor.cpp +++ b/fdbclient/ManagementAPI.actor.cpp @@ -1105,6 +1105,7 @@ ACTOR Future> changeQuorumChecker(Transaction* tr, vector>> leaderServers; ClientCoordinators coord(Reference(new ClusterConnectionFile(conn))); + leaderServers.reserve(coord.clientLeaderServers.size()); for (int i = 0; i < coord.clientLeaderServers.size(); i++) leaderServers.push_back(retryBrokenPromise(coord.clientLeaderServers[i].getLeader, @@ -1188,14 +1189,20 @@ ACTOR Future changeQuorum(Database cx, Reference>> leaderServers; - ClientCoordinators coord(Reference(new ClusterConnectionFile(conn))); + state vector>> leaderServers; + state ClientCoordinators coord(Reference(new ClusterConnectionFile(conn))); + if (! change->getDesiredClusterKeyName().empty()) { + CheckClusterNameMutabilityReply mutabilityReply = wait(coord.clientLeaderServers[0].checkClusterNameMutability.getReply( + CheckClusterNameMutability())); + if (! 
mutabilityReply.value) { + return CoordinatorsResult::BAD_DATABASE_STATE; + } + } leaderServers.reserve(coord.clientLeaderServers.size()); for (int i = 0; i < coord.clientLeaderServers.size(); i++) leaderServers.push_back(retryBrokenPromise(coord.clientLeaderServers[i].getLeader, GetLeaderRequest(coord.clusterKey, UID()), TaskPriority::CoordinationReply)); - choose { when(wait(waitForAll(leaderServers))) {} when(wait(delay(5.0))) { return CoordinatorsResult::COORDINATOR_UNREACHABLE; } diff --git a/fdbclient/MonitorLeader.actor.cpp b/fdbclient/MonitorLeader.actor.cpp index 057e546501..a4dfe5a4a1 100644 --- a/fdbclient/MonitorLeader.actor.cpp +++ b/fdbclient/MonitorLeader.actor.cpp @@ -380,11 +380,13 @@ ClientCoordinators::ClientCoordinators(Key clusterKey, std::vector pingLatencyLogger(TransportData* self) { } TransportData::TransportData(uint64_t transportId) - : endpoints(/*wellKnownTokenCount*/ 11), endpointNotFoundReceiver(endpoints), pingReceiver(endpoints), + : endpoints(/*wellKnownTokenCount*/ WLTOKEN_COUNTS), endpointNotFoundReceiver(endpoints), pingReceiver(endpoints), warnAlwaysForLargePacket(true), lastIncompatibleMessage(0), transportId(transportId), numIncompatibleConnections(0) { degraded = makeReference>(false); diff --git a/fdbserver/Coordination.actor.cpp b/fdbserver/Coordination.actor.cpp index b4d5f9f38a..974062a056 100644 --- a/fdbserver/Coordination.actor.cpp +++ b/fdbserver/Coordination.actor.cpp @@ -560,6 +560,10 @@ ACTOR Future leaderServer(LeaderElectionRegInterface interf, OnDemandStore wait(LeaderRegisterCollection::init(®s)); loop choose { + when(CheckClusterNameMutability req = waitNext(interf.checkClusterNameMutability.getFuture())) { + CheckClusterNameMutabilityReply rep(SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT ? 
true : false); + req.reply.send(rep); + } when(OpenDatabaseCoordRequest req = waitNext(interf.openDatabase.getFuture())) { Optional forward = regs.getForward(req.clusterKey); if (forward.present()) { diff --git a/fdbserver/CoordinationInterface.h b/fdbserver/CoordinationInterface.h index ea379d1358..9cf4cb3ea0 100644 --- a/fdbserver/CoordinationInterface.h +++ b/fdbserver/CoordinationInterface.h @@ -24,12 +24,12 @@ #include "fdbclient/CoordinationInterface.h" -constexpr UID WLTOKEN_LEADERELECTIONREG_CANDIDACY(-1, 4); -constexpr UID WLTOKEN_LEADERELECTIONREG_ELECTIONRESULT(-1, 5); -constexpr UID WLTOKEN_LEADERELECTIONREG_LEADERHEARTBEAT(-1, 6); -constexpr UID WLTOKEN_LEADERELECTIONREG_FORWARD(-1, 7); -constexpr UID WLTOKEN_GENERATIONREG_READ(-1, 8); -constexpr UID WLTOKEN_GENERATIONREG_WRITE(-1, 9); +constexpr UID WLTOKEN_LEADERELECTIONREG_CANDIDACY(-1, 5); +constexpr UID WLTOKEN_LEADERELECTIONREG_ELECTIONRESULT(-1, 6); +constexpr UID WLTOKEN_LEADERELECTIONREG_LEADERHEARTBEAT(-1, 7); +constexpr UID WLTOKEN_LEADERELECTIONREG_FORWARD(-1, 8); +constexpr UID WLTOKEN_GENERATIONREG_READ(-1, 9); +constexpr UID WLTOKEN_GENERATIONREG_WRITE(-1, 10); struct GenerationRegInterface { constexpr static FileIdentifier file_identifier = 16726744; From fcfb78162c74cededb6d96f28ee2844dd579af8f Mon Sep 17 00:00:00 2001 From: Dan Lambright Date: Fri, 23 Apr 2021 09:19:48 -0400 Subject: [PATCH 450/461] misc cleanup for publishing --- fdbclient/CoordinationInterface.h | 20 +++++++++++--------- fdbclient/ManagementAPI.actor.cpp | 8 ++++---- fdbclient/MonitorLeader.actor.cpp | 4 ++-- fdbrpc/FlowTransport.actor.cpp | 5 +++-- fdbserver/Coordination.actor.cpp | 21 +++++++++++---------- 5 files changed, 31 insertions(+), 27 deletions(-) diff --git a/fdbclient/CoordinationInterface.h b/fdbclient/CoordinationInterface.h index 2ebd4e1259..919ae9c315 100644 --- a/fdbclient/CoordinationInterface.h +++ b/fdbclient/CoordinationInterface.h @@ -36,11 +36,11 @@ constexpr UID 
WLTOKEN_CLIENTLEADERREG_DESCRIPTOR_MUTABLE(-1, 4); constexpr UID WLTOKEN_PROTOCOL_INFO(-1, 11); -// The coordinator interface as exposed to clients +// well known endpoints published to the client. struct ClientLeaderRegInterface { RequestStream getLeader; RequestStream openDatabase; - RequestStream checkClusterNameMutability; + RequestStream checkDescriptorMutable; ClientLeaderRegInterface() {} ClientLeaderRegInterface(NetworkAddress remote); @@ -238,10 +238,11 @@ struct ProtocolInfoRequest { } }; -struct CheckClusterNameMutabilityReply { +// Returns true if the cluster descriptor may be modified. +struct CheckDescriptorMutableReply { constexpr static FileIdentifier file_identifier = 7784299; - CheckClusterNameMutabilityReply() = default; - explicit CheckClusterNameMutabilityReply(bool value) : value(value) {} + CheckDescriptorMutableReply() = default; + explicit CheckDescriptorMutableReply(bool value) : value(value) {} bool value; template void serialize(Ar& ar) { @@ -249,12 +250,13 @@ struct CheckClusterNameMutabilityReply { } }; -struct CheckClusterNameMutability { +// Allows client to check if allowed to change the cluster descriptor. +struct CheckDescriptorMutable { constexpr static FileIdentifier file_identifier = 214729; Key key; - ReplyPromise reply; - explicit CheckClusterNameMutability(Key key) : key(key) {} - CheckClusterNameMutability(){} + ReplyPromise reply; + explicit CheckDescriptorMutable(Key key) : key(key) {} + CheckDescriptorMutable(){} template void serialize(Ar& ar) { diff --git a/fdbclient/ManagementAPI.actor.cpp b/fdbclient/ManagementAPI.actor.cpp index 56d9f0e6ec..2d6fb4c36b 100644 --- a/fdbclient/ManagementAPI.actor.cpp +++ b/fdbclient/ManagementAPI.actor.cpp @@ -1191,12 +1191,12 @@ ACTOR Future changeQuorum(Database cx, Reference>> leaderServers; state ClientCoordinators coord(Reference(new ClusterConnectionFile(conn))); + // check if allowed to modify the cluster descriptor if (! 
change->getDesiredClusterKeyName().empty()) { - CheckClusterNameMutabilityReply mutabilityReply = wait(coord.clientLeaderServers[0].checkClusterNameMutability.getReply( - CheckClusterNameMutability())); - if (! mutabilityReply.value) { + CheckDescriptorMutableReply mutabilityReply = wait(coord.clientLeaderServers[0].checkDescriptorMutable.getReply( + CheckDescriptorMutable())); + if (! mutabilityReply.value) return CoordinatorsResult::BAD_DATABASE_STATE; - } } leaderServers.reserve(coord.clientLeaderServers.size()); for (int i = 0; i < coord.clientLeaderServers.size(); i++) diff --git a/fdbclient/MonitorLeader.actor.cpp b/fdbclient/MonitorLeader.actor.cpp index a4dfe5a4a1..0a22c0c508 100644 --- a/fdbclient/MonitorLeader.actor.cpp +++ b/fdbclient/MonitorLeader.actor.cpp @@ -381,12 +381,12 @@ ClientCoordinators::ClientCoordinators(Key clusterKey, std::vector pingLatencyLogger(TransportData* self) { } TransportData::TransportData(uint64_t transportId) - : endpoints(/*wellKnownTokenCount*/ WLTOKEN_COUNTS), endpointNotFoundReceiver(endpoints), pingReceiver(endpoints), + : endpoints(WLTOKEN_COUNTS), endpointNotFoundReceiver(endpoints), pingReceiver(endpoints), warnAlwaysForLargePacket(true), lastIncompatibleMessage(0), transportId(transportId), numIncompatibleConnections(0) { degraded = makeReference>(false); diff --git a/fdbserver/Coordination.actor.cpp b/fdbserver/Coordination.actor.cpp index 974062a056..eb7dcf8c6c 100644 --- a/fdbserver/Coordination.actor.cpp +++ b/fdbserver/Coordination.actor.cpp @@ -545,7 +545,8 @@ struct LeaderRegisterCollection { } }; -StringRef getClusterName(Key key) { +// extract the prefix descriptor from cluster id +StringRef getClusterDescriptor(Key key) { StringRef str = key.contents(); return str.eat(":"); } @@ -558,10 +559,10 @@ ACTOR Future leaderServer(LeaderElectionRegInterface interf, OnDemandStore state ActorCollection forwarders(false); wait(LeaderRegisterCollection::init(®s)); - + loop choose { - when(CheckClusterNameMutability req 
= waitNext(interf.checkClusterNameMutability.getFuture())) { - CheckClusterNameMutabilityReply rep(SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT ? true : false); + when(CheckDescriptorMutable req = waitNext(interf.checkDescriptorMutable.getFuture())) { + CheckDescriptorMutableReply rep(SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT ? true : false); req.reply.send(rep); } when(OpenDatabaseCoordRequest req = waitNext(interf.openDatabase.getFuture())) { @@ -574,7 +575,7 @@ ACTOR Future leaderServer(LeaderElectionRegInterface interf, OnDemandStore } else { StringRef clusterName = ccf->getConnectionString().clusterKeyName(); if (! SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT && - getClusterName(req.clusterKey).compare(clusterName)) { + getClusterDescriptor(req.clusterKey).compare(clusterName)) { TraceEvent(SevError, "CCFMismatch") .detail("RequestType", "OpenDatabaseCoordRequest") .detail("LocalCS", ccf->getConnectionString().toString()) @@ -593,7 +594,7 @@ ACTOR Future leaderServer(LeaderElectionRegInterface interf, OnDemandStore } else { StringRef clusterName = ccf->getConnectionString().clusterKeyName(); if (! SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT && - getClusterName(req.key).compare(clusterName)) { + getClusterDescriptor(req.key).compare(clusterName)) { TraceEvent(SevError, "CCFMismatch") .detail("RequestType", "ElectionResultRequest") .detail("LocalCS", ccf->getConnectionString().toString()) @@ -613,7 +614,7 @@ ACTOR Future leaderServer(LeaderElectionRegInterface interf, OnDemandStore else { StringRef clusterName = ccf->getConnectionString().clusterKeyName(); if (! 
SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT && - getClusterName(req.key).compare(clusterName)) { + getClusterDescriptor(req.key).compare(clusterName)) { TraceEvent(SevError, "CCFMismatch") .detail("RequestType", "GetLeaderRequest") .detail("LocalCS", ccf->getConnectionString().toString()) @@ -632,7 +633,7 @@ ACTOR Future leaderServer(LeaderElectionRegInterface interf, OnDemandStore else { StringRef clusterName = ccf->getConnectionString().clusterKeyName(); if (! SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT && - getClusterName(req.key).compare(clusterName)) { + getClusterDescriptor(req.key).compare(clusterName)) { TraceEvent(SevError, "CCFMismatch") .detail("RequestType", "CandidacyRequest") .detail("LocalCS", ccf->getConnectionString().toString()) @@ -650,7 +651,7 @@ ACTOR Future leaderServer(LeaderElectionRegInterface interf, OnDemandStore else { StringRef clusterName = ccf->getConnectionString().clusterKeyName(); if (! SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT && - getClusterName(req.key).compare(clusterName)) { + getClusterDescriptor(req.key).compare(clusterName)) { TraceEvent(SevError, "CCFMismatch") .detail("RequestType", "LeaderHeartbeatRequest") .detail("LocalCS", ccf->getConnectionString().toString()) @@ -668,7 +669,7 @@ ACTOR Future leaderServer(LeaderElectionRegInterface interf, OnDemandStore else { StringRef clusterName = ccf->getConnectionString().clusterKeyName(); if (! 
SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT && - getClusterName(req.key).compare(clusterName)) { + getClusterDescriptor(req.key).compare(clusterName)) { TraceEvent(SevError, "CCFMismatch") .detail("RequestType", "ForwardRequest") .detail("LocalCS", ccf->getConnectionString().toString()) From fc65154b5dad31705bf628abb52d4c4361ef4720 Mon Sep 17 00:00:00 2001 From: Dan Lambright Date: Wed, 28 Apr 2021 08:48:15 -0400 Subject: [PATCH 451/461] forward back new coordinator --- fdbclient/CoordinationInterface.h | 18 +++++++++--------- fdbserver/Coordination.actor.cpp | 11 ++++++++--- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/fdbclient/CoordinationInterface.h b/fdbclient/CoordinationInterface.h index 919ae9c315..71448bf8b2 100644 --- a/fdbclient/CoordinationInterface.h +++ b/fdbclient/CoordinationInterface.h @@ -252,16 +252,16 @@ struct CheckDescriptorMutableReply { // Allows client to check if allowed to change the cluster descriptor. struct CheckDescriptorMutable { - constexpr static FileIdentifier file_identifier = 214729; - Key key; - ReplyPromise reply; - explicit CheckDescriptorMutable(Key key) : key(key) {} - CheckDescriptorMutable(){} + constexpr static FileIdentifier file_identifier = 214729; + Key key; + ReplyPromise reply; + explicit CheckDescriptorMutable(Key key) : key(key) {} + CheckDescriptorMutable(){} - template - void serialize(Ar& ar) { - serializer(ar, key, reply); - } + template + void serialize(Ar& ar) { + serializer(ar, key, reply); + } }; #endif diff --git a/fdbserver/Coordination.actor.cpp b/fdbserver/Coordination.actor.cpp index eb7dcf8c6c..733637e7a0 100644 --- a/fdbserver/Coordination.actor.cpp +++ b/fdbserver/Coordination.actor.cpp @@ -559,11 +559,16 @@ ACTOR Future leaderServer(LeaderElectionRegInterface interf, OnDemandStore state ActorCollection forwarders(false); wait(LeaderRegisterCollection::init(®s)); - + loop choose { when(CheckDescriptorMutable req = waitNext(interf.checkDescriptorMutable.getFuture())) { - 
CheckDescriptorMutableReply rep(SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT ? true : false); - req.reply.send(rep); + Optional forward = regs.getForward(req.key); + if (forward.present()) { + req.reply.send(CheckDescriptorMutableReply{false}); + } else { + CheckDescriptorMutableReply rep(SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT ? true : false); + req.reply.send(rep); + } } when(OpenDatabaseCoordRequest req = waitNext(interf.openDatabase.getFuture())) { Optional forward = regs.getForward(req.clusterKey); From 53d0ecc2fa0e2c04767f02d2116f588753ce7cd9 Mon Sep 17 00:00:00 2001 From: Dan Lambright Date: Wed, 5 May 2021 15:01:56 -0400 Subject: [PATCH 452/461] respond to comments made on 5/4 --- fdbclient/CoordinationInterface.h | 3 +- fdbserver/Coordination.actor.cpp | 47 +++++++++++++++---------------- 2 files changed, 24 insertions(+), 26 deletions(-) diff --git a/fdbclient/CoordinationInterface.h b/fdbclient/CoordinationInterface.h index 71448bf8b2..a36182b7f3 100644 --- a/fdbclient/CoordinationInterface.h +++ b/fdbclient/CoordinationInterface.h @@ -34,6 +34,7 @@ constexpr UID WLTOKEN_CLIENTLEADERREG_GETLEADER(-1, 2); constexpr UID WLTOKEN_CLIENTLEADERREG_OPENDATABASE(-1, 3); constexpr UID WLTOKEN_CLIENTLEADERREG_DESCRIPTOR_MUTABLE(-1, 4); +// the value of this endpoint should be stable and not change. constexpr UID WLTOKEN_PROTOCOL_INFO(-1, 11); // well known endpoints published to the client. 
@@ -256,7 +257,7 @@ struct CheckDescriptorMutable { Key key; ReplyPromise reply; explicit CheckDescriptorMutable(Key key) : key(key) {} - CheckDescriptorMutable(){} + CheckDescriptorMutable() {} template void serialize(Ar& ar) { diff --git a/fdbserver/Coordination.actor.cpp b/fdbserver/Coordination.actor.cpp index 733637e7a0..22c1fb2ce8 100644 --- a/fdbserver/Coordination.actor.cpp +++ b/fdbserver/Coordination.actor.cpp @@ -406,8 +406,8 @@ ACTOR Future leaderRegister(LeaderElectionRegInterface interf, Key key) { // If the current leader's priority became worse, we still need to notified all clients because now one // of them might be better than the leader. In addition, even though FitnessRemote is better than - // FitnessUnknown, we still need to notified clients so that monitorLeaderRemotely has a chance to switch - // from passively monitoring the leader to actively attempting to become the leader. + // FitnessUnknown, we still need to notified clients so that monitorLeaderRemotely has a chance to + // switch from passively monitoring the leader to actively attempting to become the leader. if (!currentNominee.present() || !nextNominee.present() || !currentNominee.get().equalInternalId(nextNominee.get()) || nextNominee.get() > currentNominee.get() || @@ -553,8 +553,10 @@ StringRef getClusterDescriptor(Key key) { // leaderServer multiplexes multiple leaderRegisters onto a single LeaderElectionRegInterface, // creating and destroying them on demand. 
-ACTOR Future leaderServer(LeaderElectionRegInterface interf, OnDemandStore* pStore, UID id, - Reference ccf) { +ACTOR Future leaderServer(LeaderElectionRegInterface interf, + OnDemandStore* pStore, + UID id, + Reference ccf) { state LeaderRegisterCollection regs(pStore); state ActorCollection forwarders(false); @@ -564,7 +566,7 @@ ACTOR Future leaderServer(LeaderElectionRegInterface interf, OnDemandStore when(CheckDescriptorMutable req = waitNext(interf.checkDescriptorMutable.getFuture())) { Optional forward = regs.getForward(req.key); if (forward.present()) { - req.reply.send(CheckDescriptorMutableReply{false}); + req.reply.send(CheckDescriptorMutableReply{ false }); } else { CheckDescriptorMutableReply rep(SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT ? true : false); req.reply.send(rep); @@ -579,9 +581,9 @@ ACTOR Future leaderServer(LeaderElectionRegInterface interf, OnDemandStore req.reply.send(CachedSerialization(info)); } else { StringRef clusterName = ccf->getConnectionString().clusterKeyName(); - if (! SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT && - getClusterDescriptor(req.clusterKey).compare(clusterName)) { - TraceEvent(SevError, "CCFMismatch") + if (!SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT && + getClusterDescriptor(req.clusterKey).compare(clusterName)) { + TraceEvent(SevWarn, "CCFMismatch") .detail("RequestType", "OpenDatabaseCoordRequest") .detail("LocalCS", ccf->getConnectionString().toString()) .detail("IncomingClusterKey", req.clusterKey) @@ -598,9 +600,8 @@ ACTOR Future leaderServer(LeaderElectionRegInterface interf, OnDemandStore req.reply.send(forward.get()); } else { StringRef clusterName = ccf->getConnectionString().clusterKeyName(); - if (! 
SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT && - getClusterDescriptor(req.key).compare(clusterName)) { - TraceEvent(SevError, "CCFMismatch") + if (!SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT && getClusterDescriptor(req.key).compare(clusterName)) { + TraceEvent(SevWarn, "CCFMismatch") .detail("RequestType", "ElectionResultRequest") .detail("LocalCS", ccf->getConnectionString().toString()) .detail("IncomingClusterKey", req.key) @@ -618,9 +619,8 @@ ACTOR Future leaderServer(LeaderElectionRegInterface interf, OnDemandStore req.reply.send(forward.get()); else { StringRef clusterName = ccf->getConnectionString().clusterKeyName(); - if (! SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT && - getClusterDescriptor(req.key).compare(clusterName)) { - TraceEvent(SevError, "CCFMismatch") + if (!SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT && getClusterDescriptor(req.key).compare(clusterName)) { + TraceEvent(SevWarn, "CCFMismatch") .detail("RequestType", "GetLeaderRequest") .detail("LocalCS", ccf->getConnectionString().toString()) .detail("IncomingClusterKey", req.key) @@ -637,9 +637,8 @@ ACTOR Future leaderServer(LeaderElectionRegInterface interf, OnDemandStore req.reply.send(forward.get()); else { StringRef clusterName = ccf->getConnectionString().clusterKeyName(); - if (! SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT && - getClusterDescriptor(req.key).compare(clusterName)) { - TraceEvent(SevError, "CCFMismatch") + if (!SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT && getClusterDescriptor(req.key).compare(clusterName)) { + TraceEvent(SevWarn, "CCFMismatch") .detail("RequestType", "CandidacyRequest") .detail("LocalCS", ccf->getConnectionString().toString()) .detail("IncomingClusterKey", req.key); @@ -655,9 +654,8 @@ ACTOR Future leaderServer(LeaderElectionRegInterface interf, OnDemandStore req.reply.send(LeaderHeartbeatReply{ false }); else { StringRef clusterName = ccf->getConnectionString().clusterKeyName(); - if (! 
SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT && - getClusterDescriptor(req.key).compare(clusterName)) { - TraceEvent(SevError, "CCFMismatch") + if (!SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT && getClusterDescriptor(req.key).compare(clusterName)) { + TraceEvent(SevWarn, "CCFMismatch") .detail("RequestType", "LeaderHeartbeatRequest") .detail("LocalCS", ccf->getConnectionString().toString()) .detail("IncomingClusterKey", req.key); @@ -673,16 +671,15 @@ ACTOR Future leaderServer(LeaderElectionRegInterface interf, OnDemandStore req.reply.send(Void()); else { StringRef clusterName = ccf->getConnectionString().clusterKeyName(); - if (! SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT && - getClusterDescriptor(req.key).compare(clusterName)) { - TraceEvent(SevError, "CCFMismatch") + if (!SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT && getClusterDescriptor(req.key).compare(clusterName)) { + TraceEvent(SevWarn, "CCFMismatch") .detail("RequestType", "ForwardRequest") .detail("LocalCS", ccf->getConnectionString().toString()) .detail("IncomingClusterKey", req.key); req.reply.sendError(wrong_connection_file()); } else { - forwarders.add( - LeaderRegisterCollection::setForward(®s, req.key, ClusterConnectionString(req.conn.toString()))); + forwarders.add(LeaderRegisterCollection::setForward( + ®s, req.key, ClusterConnectionString(req.conn.toString()))); regs.getInterface(req.key, id).forward.send(req); } } From 64c10d36250c0acedb78ebd531598ef0e54237e6 Mon Sep 17 00:00:00 2001 From: Dan Lambright Date: Thu, 6 May 2021 11:51:33 -0400 Subject: [PATCH 453/461] fix joshua failures, formatting --- fdbclient/ManagementAPI.actor.cpp | 8 ++++---- fdbclient/MonitorLeader.actor.cpp | 6 ++++-- fdbrpc/FlowTransport.actor.cpp | 4 ++-- fdbserver/Knobs.h | 3 ++- fdbserver/workloads/ConfigureDatabase.actor.cpp | 5 +++-- fdbserver/workloads/SpecialKeySpaceCorrectness.actor.cpp | 4 +++- 6 files changed, 18 insertions(+), 12 deletions(-) diff --git a/fdbclient/ManagementAPI.actor.cpp 
b/fdbclient/ManagementAPI.actor.cpp index 2d6fb4c36b..bfa998d25f 100644 --- a/fdbclient/ManagementAPI.actor.cpp +++ b/fdbclient/ManagementAPI.actor.cpp @@ -1192,10 +1192,10 @@ ACTOR Future changeQuorum(Database cx, Reference>> leaderServers; state ClientCoordinators coord(Reference(new ClusterConnectionFile(conn))); // check if allowed to modify the cluster descriptor - if (! change->getDesiredClusterKeyName().empty()) { - CheckDescriptorMutableReply mutabilityReply = wait(coord.clientLeaderServers[0].checkDescriptorMutable.getReply( - CheckDescriptorMutable())); - if (! mutabilityReply.value) + if (!change->getDesiredClusterKeyName().empty()) { + CheckDescriptorMutableReply mutabilityReply = + wait(coord.clientLeaderServers[0].checkDescriptorMutable.getReply(CheckDescriptorMutable())); + if (!mutabilityReply.value) return CoordinatorsResult::BAD_DATABASE_STATE; } leaderServers.reserve(coord.clientLeaderServers.size()); diff --git a/fdbclient/MonitorLeader.actor.cpp b/fdbclient/MonitorLeader.actor.cpp index 0a22c0c508..86a09ff424 100644 --- a/fdbclient/MonitorLeader.actor.cpp +++ b/fdbclient/MonitorLeader.actor.cpp @@ -386,7 +386,8 @@ ClientLeaderRegInterface::ClientLeaderRegInterface(NetworkAddress remote) ClientLeaderRegInterface::ClientLeaderRegInterface(INetwork* local) { getLeader.makeWellKnownEndpoint(WLTOKEN_CLIENTLEADERREG_GETLEADER, TaskPriority::Coordination); openDatabase.makeWellKnownEndpoint(WLTOKEN_CLIENTLEADERREG_OPENDATABASE, TaskPriority::Coordination); - checkDescriptorMutable.makeWellKnownEndpoint(WLTOKEN_CLIENTLEADERREG_DESCRIPTOR_MUTABLE, TaskPriority::Coordination); + checkDescriptorMutable.makeWellKnownEndpoint(WLTOKEN_CLIENTLEADERREG_DESCRIPTOR_MUTABLE, + TaskPriority::Coordination); } // Nominee is the worker among all workers that are considered as leader by a coordinator @@ -498,7 +499,8 @@ ACTOR Future monitorLeaderOneGeneration(ReferencegetConnectionString().toString()).trackLatest("MonitorLeaderForwarding"); + .detail("OldConnStr", 
info.intermediateConnFile->getConnectionString().toString()) + .trackLatest("MonitorLeaderForwarding"); info.intermediateConnFile = makeReference( connFile->getFilename(), ClusterConnectionString(leader.get().first.serializedInfo.toString())); return info; diff --git a/fdbrpc/FlowTransport.actor.cpp b/fdbrpc/FlowTransport.actor.cpp index 23cd61be53..248011ffcb 100644 --- a/fdbrpc/FlowTransport.actor.cpp +++ b/fdbrpc/FlowTransport.actor.cpp @@ -51,7 +51,7 @@ constexpr UID WLTOKEN_PING_PACKET(-1, 1); constexpr int PACKET_LEN_WIDTH = sizeof(uint32_t); const uint64_t TOKEN_STREAM_FLAG = 1; -const int WLTOKEN_COUNTS = 12; // number of wellKnownEndpoints +const int WLTOKEN_COUNTS = 13; // number of wellKnownEndpoints class EndpointMap : NonCopyable { public: @@ -1218,7 +1218,7 @@ ACTOR static Future connectionReader(TransportData* transport, } compatible = false; if (!protocolVersion.hasInexpensiveMultiVersionClient()) { - if(peer) { + if (peer) { peer->protocolVersion->set(protocolVersion); } diff --git a/fdbserver/Knobs.h b/fdbserver/Knobs.h index 3426f6bb18..a89ac9c375 100644 --- a/fdbserver/Knobs.h +++ b/fdbserver/Knobs.h @@ -559,7 +559,8 @@ public: // Coordination double COORDINATED_STATE_ONCONFLICT_POLL_INTERVAL; - bool ENABLE_CROSS_CLUSTER_SUPPORT; // Allow a coordinator to serve requests whose connection string does not match the local descriptor + bool ENABLE_CROSS_CLUSTER_SUPPORT; // Allow a coordinator to serve requests whose connection string does not match + // the local descriptor // Buggification double BUGGIFIED_EVENTUAL_CONSISTENCY; diff --git a/fdbserver/workloads/ConfigureDatabase.actor.cpp b/fdbserver/workloads/ConfigureDatabase.actor.cpp index 0ab7d1b88b..ae03375ccb 100644 --- a/fdbserver/workloads/ConfigureDatabase.actor.cpp +++ b/fdbserver/workloads/ConfigureDatabase.actor.cpp @@ -318,12 +318,13 @@ struct ConfigureDatabaseWorkload : TestWorkload { //TraceEvent("ConfigureTestQuorumBegin").detail("NewQuorum", s); auto ch = autoQuorumChange(); 
std::string desiredClusterName = "NewName%d"; - if (! SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT) { + if (!SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT) { // if configuration does not allow changing the descriptor, pass empty string (keep old descriptor) desiredClusterName = ""; } if (deterministicRandom()->randomInt(0, 2)) - ch = nameQuorumChange(format(desiredClusterName.c_str(), deterministicRandom()->randomInt(0, 100)), ch); + ch = nameQuorumChange(format(desiredClusterName.c_str(), deterministicRandom()->randomInt(0, 100)), + ch); wait(success(changeQuorum(cx, ch))); //TraceEvent("ConfigureTestConfigureEnd").detail("NewQuorum", s); } else if (randomChoice == 5) { diff --git a/fdbserver/workloads/SpecialKeySpaceCorrectness.actor.cpp b/fdbserver/workloads/SpecialKeySpaceCorrectness.actor.cpp index 5a38e20d7e..6d6f711a9f 100644 --- a/fdbserver/workloads/SpecialKeySpaceCorrectness.actor.cpp +++ b/fdbserver/workloads/SpecialKeySpaceCorrectness.actor.cpp @@ -937,7 +937,9 @@ struct SpecialKeySpaceCorrectnessWorkload : TestWorkload { // we randomly pick one process(not coordinator) and add it, in this case, it should always succeed { // choose a new description if configuration allows transactions across differently named clusters - state std::string new_cluster_description = SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT ? deterministicRandom()->randomAlphaNumeric(8) : cs.clusterKeyName().toString(); + state std::string new_cluster_description = SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT + ? 
deterministicRandom()->randomAlphaNumeric(8) + : cs.clusterKeyName().toString(); state std::string new_coordinator_process; state std::vector old_coordinators_processes; state bool possible_to_add_coordinator; From 10289ef8f1d5aeba5842e506b958f7b7f8bfe799 Mon Sep 17 00:00:00 2001 From: Dan Lambright Date: Mon, 17 May 2021 17:22:19 -0400 Subject: [PATCH 454/461] Respond to AJs comments --- fdbclient/CoordinationInterface.h | 17 ++++++++--------- fdbclient/ManagementAPI.actor.cpp | 4 ++-- fdbrpc/FlowTransport.actor.cpp | 4 ++-- fdbserver/Coordination.actor.cpp | 8 +++++--- fdbserver/CoordinationInterface.h | 12 ++++++------ 5 files changed, 23 insertions(+), 22 deletions(-) diff --git a/fdbclient/CoordinationInterface.h b/fdbclient/CoordinationInterface.h index a36182b7f3..dda9cb47ed 100644 --- a/fdbclient/CoordinationInterface.h +++ b/fdbclient/CoordinationInterface.h @@ -32,16 +32,16 @@ const int MAX_CLUSTER_FILE_BYTES = 60000; constexpr UID WLTOKEN_CLIENTLEADERREG_GETLEADER(-1, 2); constexpr UID WLTOKEN_CLIENTLEADERREG_OPENDATABASE(-1, 3); -constexpr UID WLTOKEN_CLIENTLEADERREG_DESCRIPTOR_MUTABLE(-1, 4); // the value of this endpoint should be stable and not change. -constexpr UID WLTOKEN_PROTOCOL_INFO(-1, 11); +constexpr UID WLTOKEN_PROTOCOL_INFO(-1, 10); +constexpr UID WLTOKEN_CLIENTLEADERREG_DESCRIPTOR_MUTABLE(-1, 11); // well known endpoints published to the client. 
struct ClientLeaderRegInterface { RequestStream getLeader; RequestStream openDatabase; - RequestStream checkDescriptorMutable; + RequestStream checkDescriptorMutable; ClientLeaderRegInterface() {} ClientLeaderRegInterface(NetworkAddress remote); @@ -243,21 +243,20 @@ struct ProtocolInfoRequest { struct CheckDescriptorMutableReply { constexpr static FileIdentifier file_identifier = 7784299; CheckDescriptorMutableReply() = default; - explicit CheckDescriptorMutableReply(bool value) : value(value) {} - bool value; + explicit CheckDescriptorMutableReply(bool isMutable) : isMutable(isMutable) {} + bool isMutable; template void serialize(Ar& ar) { - serializer(ar, value); + serializer(ar, isMutable); } }; // Allows client to check if allowed to change the cluster descriptor. -struct CheckDescriptorMutable { +struct CheckDescriptorMutableRequest { constexpr static FileIdentifier file_identifier = 214729; Key key; ReplyPromise reply; - explicit CheckDescriptorMutable(Key key) : key(key) {} - CheckDescriptorMutable() {} + CheckDescriptorMutableRequest() {} template void serialize(Ar& ar) { diff --git a/fdbclient/ManagementAPI.actor.cpp b/fdbclient/ManagementAPI.actor.cpp index bfa998d25f..217340a93c 100644 --- a/fdbclient/ManagementAPI.actor.cpp +++ b/fdbclient/ManagementAPI.actor.cpp @@ -1194,8 +1194,8 @@ ACTOR Future changeQuorum(Database cx, ReferencegetDesiredClusterKeyName().empty()) { CheckDescriptorMutableReply mutabilityReply = - wait(coord.clientLeaderServers[0].checkDescriptorMutable.getReply(CheckDescriptorMutable())); - if (!mutabilityReply.value) + wait(coord.clientLeaderServers[0].checkDescriptorMutable.getReply(CheckDescriptorMutableRequest())); + if (!mutabilityReply.isMutable) return CoordinatorsResult::BAD_DATABASE_STATE; } leaderServers.reserve(coord.clientLeaderServers.size()); diff --git a/fdbrpc/FlowTransport.actor.cpp b/fdbrpc/FlowTransport.actor.cpp index 248011ffcb..9e978dda66 100644 --- a/fdbrpc/FlowTransport.actor.cpp +++ 
b/fdbrpc/FlowTransport.actor.cpp @@ -51,7 +51,7 @@ constexpr UID WLTOKEN_PING_PACKET(-1, 1); constexpr int PACKET_LEN_WIDTH = sizeof(uint32_t); const uint64_t TOKEN_STREAM_FLAG = 1; -const int WLTOKEN_COUNTS = 13; // number of wellKnownEndpoints +const int WLTOKEN_COUNTS = 12; // number of wellKnownEndpoints class EndpointMap : NonCopyable { public: @@ -98,7 +98,7 @@ void EndpointMap::realloc() { void EndpointMap::insertWellKnown(NetworkMessageReceiver* r, const Endpoint::Token& token, TaskPriority priority) { int index = token.second(); - ASSERT(index < WLTOKEN_COUNTS); + ASSERT(index <= WLTOKEN_COUNTS); ASSERT(data[index].receiver == nullptr); data[index].receiver = r; data[index].token() = diff --git a/fdbserver/Coordination.actor.cpp b/fdbserver/Coordination.actor.cpp index 22c1fb2ce8..16124db34a 100644 --- a/fdbserver/Coordination.actor.cpp +++ b/fdbserver/Coordination.actor.cpp @@ -563,12 +563,14 @@ ACTOR Future leaderServer(LeaderElectionRegInterface interf, wait(LeaderRegisterCollection::init(®s)); loop choose { - when(CheckDescriptorMutable req = waitNext(interf.checkDescriptorMutable.getFuture())) { + when(CheckDescriptorMutableRequest req = waitNext(interf.checkDescriptorMutable.getFuture())) { Optional forward = regs.getForward(req.key); + // Note the response returns the value of a knob enforced by checking only one coordinator. It is not + // quorum based. if (forward.present()) { - req.reply.send(CheckDescriptorMutableReply{ false }); + req.reply.sendError(coordinators_changed()); } else { - CheckDescriptorMutableReply rep(SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT ? 
true : false); + CheckDescriptorMutableReply rep(SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT); req.reply.send(rep); } } diff --git a/fdbserver/CoordinationInterface.h b/fdbserver/CoordinationInterface.h index 9cf4cb3ea0..ea379d1358 100644 --- a/fdbserver/CoordinationInterface.h +++ b/fdbserver/CoordinationInterface.h @@ -24,12 +24,12 @@ #include "fdbclient/CoordinationInterface.h" -constexpr UID WLTOKEN_LEADERELECTIONREG_CANDIDACY(-1, 5); -constexpr UID WLTOKEN_LEADERELECTIONREG_ELECTIONRESULT(-1, 6); -constexpr UID WLTOKEN_LEADERELECTIONREG_LEADERHEARTBEAT(-1, 7); -constexpr UID WLTOKEN_LEADERELECTIONREG_FORWARD(-1, 8); -constexpr UID WLTOKEN_GENERATIONREG_READ(-1, 9); -constexpr UID WLTOKEN_GENERATIONREG_WRITE(-1, 10); +constexpr UID WLTOKEN_LEADERELECTIONREG_CANDIDACY(-1, 4); +constexpr UID WLTOKEN_LEADERELECTIONREG_ELECTIONRESULT(-1, 5); +constexpr UID WLTOKEN_LEADERELECTIONREG_LEADERHEARTBEAT(-1, 6); +constexpr UID WLTOKEN_LEADERELECTIONREG_FORWARD(-1, 7); +constexpr UID WLTOKEN_GENERATIONREG_READ(-1, 8); +constexpr UID WLTOKEN_GENERATIONREG_WRITE(-1, 9); struct GenerationRegInterface { constexpr static FileIdentifier file_identifier = 16726744; From d233e1736f1fdb3282c7b70baa4fede5ed4e01e5 Mon Sep 17 00:00:00 2001 From: Jingyu Zhou Date: Thu, 27 May 2021 09:58:02 -0700 Subject: [PATCH 455/461] Add release notes for PR 4863 --- documentation/sphinx/source/release-notes/release-notes-630.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/documentation/sphinx/source/release-notes/release-notes-630.rst b/documentation/sphinx/source/release-notes/release-notes-630.rst index bebd55e859..ca6a8fd029 100644 --- a/documentation/sphinx/source/release-notes/release-notes-630.rst +++ b/documentation/sphinx/source/release-notes/release-notes-630.rst @@ -5,6 +5,7 @@ Release Notes 6.3.14 ====== +* Fixed fdbbackup start command that automatically configures database with backup workers to only do so when using partitioned logs. 
`(PR #4863) `_ * Added ``cluster.bounce_impact`` section to status to report if there will be any extra effects when bouncing the cluster, and if so, the reason for those effects. `(PR #4770) `_ * Added ``fetched_versions`` to the storage metrics section of status to report how fast a storage server is catching up in versions. `(PR #4770) `_ * Added ``fetches_from_logs`` to the storage metrics section of status to report how frequently a storage server fetches updates from transaction logs. `(PR #4770) `_ From d82eac406245dbcc16736dd7f81bee57d7db0fea Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Thu, 27 May 2021 20:41:49 -0700 Subject: [PATCH 456/461] Fix a test issue where closing an AsyncFileNonDurable could permanently prevent you from reopening the file if the machine was in a failed state during cleanup --- fdbrpc/AsyncFileNonDurable.actor.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fdbrpc/AsyncFileNonDurable.actor.h b/fdbrpc/AsyncFileNonDurable.actor.h index bde8e0fe9e..f813c1a354 100644 --- a/fdbrpc/AsyncFileNonDurable.actor.h +++ b/fdbrpc/AsyncFileNonDurable.actor.h @@ -238,6 +238,7 @@ public: //TraceEvent("AsyncFileNonDurableOpenWaitOnDelete2").detail("Filename", filename); if (shutdown.isReady()) throw io_error().asInjectedFault(); + wait(g_simulator.onProcess(currentProcess, currentTaskID)); } state Reference nonDurableFile( @@ -859,11 +860,9 @@ private: //TraceEvent("AsyncFileNonDurable_FinishDelete", self->id).detail("Filename", self->filename); delete self; - wait(g_simulator.onProcess(currentProcess, currentTaskID)); return Void(); } catch (Error& e) { state Error err = e; - wait(g_simulator.onProcess(currentProcess, currentTaskID)); throw err; } } From 750901dd1d9c124801701156be7bf677b0adfc6f Mon Sep 17 00:00:00 2001 From: "A.J. 
Beamon" Date: Thu, 27 May 2021 21:54:59 -0700 Subject: [PATCH 457/461] Reduce the frequency that buggified reads are failed so that transactions with a lot of reads aren't doomed to almost always fail. --- fdbclient/NativeAPI.actor.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 75c11db594..214b8196ac 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -2235,7 +2235,7 @@ ACTOR Future> getValue(Future version, state GetValueReply reply; try { - if (CLIENT_BUGGIFY) { + if (CLIENT_BUGGIFY_WITH_PROB(.01)) { throw deterministicRandom()->randomChoice( std::vector{ transaction_too_old(), future_version() }); } @@ -3078,7 +3078,7 @@ ACTOR Future getRange(Database cx, ++cx->transactionPhysicalReads; state GetKeyValuesReply rep; try { - if (CLIENT_BUGGIFY) { + if (CLIENT_BUGGIFY_WITH_PROB(.01)) { throw deterministicRandom()->randomChoice( std::vector{ transaction_too_old(), future_version() }); } From cc3175fc505bf130e98cb2cb69a7200e966c2ebf Mon Sep 17 00:00:00 2001 From: Dan Lambright Date: Fri, 28 May 2021 11:09:41 -0400 Subject: [PATCH 458/461] remove forwarding --- fdbclient/CoordinationInterface.h | 3 +-- fdbserver/Coordination.actor.cpp | 9 ++------- 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/fdbclient/CoordinationInterface.h b/fdbclient/CoordinationInterface.h index dda9cb47ed..bb76688b15 100644 --- a/fdbclient/CoordinationInterface.h +++ b/fdbclient/CoordinationInterface.h @@ -254,13 +254,12 @@ struct CheckDescriptorMutableReply { // Allows client to check if allowed to change the cluster descriptor. 
struct CheckDescriptorMutableRequest { constexpr static FileIdentifier file_identifier = 214729; - Key key; ReplyPromise reply; CheckDescriptorMutableRequest() {} template void serialize(Ar& ar) { - serializer(ar, key, reply); + serializer(ar, reply); } }; diff --git a/fdbserver/Coordination.actor.cpp b/fdbserver/Coordination.actor.cpp index 16124db34a..02c90aad19 100644 --- a/fdbserver/Coordination.actor.cpp +++ b/fdbserver/Coordination.actor.cpp @@ -564,15 +564,10 @@ ACTOR Future leaderServer(LeaderElectionRegInterface interf, loop choose { when(CheckDescriptorMutableRequest req = waitNext(interf.checkDescriptorMutable.getFuture())) { - Optional forward = regs.getForward(req.key); // Note the response returns the value of a knob enforced by checking only one coordinator. It is not // quorum based. - if (forward.present()) { - req.reply.sendError(coordinators_changed()); - } else { - CheckDescriptorMutableReply rep(SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT); - req.reply.send(rep); - } + CheckDescriptorMutableReply rep(SERVER_KNOBS->ENABLE_CROSS_CLUSTER_SUPPORT); + req.reply.send(rep); } when(OpenDatabaseCoordRequest req = waitNext(interf.openDatabase.getFuture())) { Optional forward = regs.getForward(req.clusterKey); From f6253db7dc08ce470e2d8c2f5b182a06e9411da8 Mon Sep 17 00:00:00 2001 From: Josh Slocum Date: Fri, 28 May 2021 18:19:42 +0000 Subject: [PATCH 459/461] Addressing final PR comments --- fdbrpc/QueueModel.cpp | 2 +- fdbrpc/QueueModel.h | 2 +- fdbserver/storageserver.actor.cpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fdbrpc/QueueModel.cpp b/fdbrpc/QueueModel.cpp index 6aaaf3df34..124c839647 100644 --- a/fdbrpc/QueueModel.cpp +++ b/fdbrpc/QueueModel.cpp @@ -60,7 +60,7 @@ double QueueModel::addRequest(uint64_t id) { return d.penalty; } -void QueueModel::updateTssEndpoint(uint64_t endpointId, TSSEndpointData tssData) { +void QueueModel::updateTssEndpoint(uint64_t endpointId, const TSSEndpointData& tssData) { auto& d = 
data[endpointId]; if (!d.tssData.present()) { tssCount++; diff --git a/fdbrpc/QueueModel.h b/fdbrpc/QueueModel.h index 1e8cd009a0..89db9afee8 100644 --- a/fdbrpc/QueueModel.h +++ b/fdbrpc/QueueModel.h @@ -110,7 +110,7 @@ public: int laggingRequestCount; int laggingTSSCompareCount; - void updateTssEndpoint(uint64_t endpointId, TSSEndpointData endpointData); + void updateTssEndpoint(uint64_t endpointId, const TSSEndpointData& endpointData); void removeOldTssData(UID currentGeneration); Optional getTssData(uint64_t endpointId); diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp index 507de28f32..1f55bf4070 100644 --- a/fdbserver/storageserver.actor.cpp +++ b/fdbserver/storageserver.actor.cpp @@ -4071,7 +4071,7 @@ ACTOR Future restoreDurableState(StorageServer* data, IKeyValueStore* stor wait(yield()); } - // TODO why is this seemingly random delay here? + // TODO: why is this seemingly random delay here? wait(delay(0.0001)); { From f28dae7c70c102fada52ad27bbcdeb7fcbe4e853 Mon Sep 17 00:00:00 2001 From: "A.J. 
Beamon" Date: Fri, 28 May 2021 12:43:30 -0700 Subject: [PATCH 460/461] Require a minimum of 6.2.33 for 6.2 snapshot restarting tests to avoid a bug in prior versions --- tests/CMakeLists.txt | 16 ++++++++-------- .../SnapCycleRestart-1.txt | 0 .../SnapCycleRestart-2.txt | 0 .../SnapTestAttrition-1.txt | 0 .../SnapTestAttrition-2.txt | 0 .../SnapTestRestart-1.txt | 0 .../SnapTestRestart-2.txt | 0 .../SnapTestSimpleRestart-1.txt | 0 .../SnapTestSimpleRestart-2.txt | 0 9 files changed, 8 insertions(+), 8 deletions(-) rename tests/restarting/{from_6.2.29 => from_6.2.33}/SnapCycleRestart-1.txt (100%) rename tests/restarting/{from_6.2.29 => from_6.2.33}/SnapCycleRestart-2.txt (100%) rename tests/restarting/{from_6.2.29 => from_6.2.33}/SnapTestAttrition-1.txt (100%) rename tests/restarting/{from_6.2.29 => from_6.2.33}/SnapTestAttrition-2.txt (100%) rename tests/restarting/{from_6.2.29 => from_6.2.33}/SnapTestRestart-1.txt (100%) rename tests/restarting/{from_6.2.29 => from_6.2.33}/SnapTestRestart-2.txt (100%) rename tests/restarting/{from_6.2.29 => from_6.2.33}/SnapTestSimpleRestart-1.txt (100%) rename tests/restarting/{from_6.2.29 => from_6.2.33}/SnapTestSimpleRestart-2.txt (100%) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index e12b1e3ce9..5b254573fc 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -186,17 +186,17 @@ if(WITH_PYTHON) TEST_FILES restarting/from_5.0.0/StorefrontTestRestart-1.txt restarting/from_5.0.0/StorefrontTestRestart-2.txt) add_fdb_test( - TEST_FILES restarting/from_6.2.29/SnapTestAttrition-1.txt - restarting/from_6.2.29/SnapTestAttrition-2.txt) + TEST_FILES restarting/from_6.2.33/SnapTestAttrition-1.txt + restarting/from_6.2.33/SnapTestAttrition-2.txt) add_fdb_test( - TEST_FILES restarting/from_6.2.29/SnapTestSimpleRestart-1.txt - restarting/from_6.2.29/SnapTestSimpleRestart-2.txt) + TEST_FILES restarting/from_6.2.33/SnapTestSimpleRestart-1.txt + restarting/from_6.2.33/SnapTestSimpleRestart-2.txt) add_fdb_test( - 
TEST_FILES restarting/from_6.2.29/SnapTestRestart-1.txt - restarting/from_6.2.29/SnapTestRestart-2.txt) + TEST_FILES restarting/from_6.2.33/SnapTestRestart-1.txt + restarting/from_6.2.33/SnapTestRestart-2.txt) add_fdb_test( - TEST_FILES restarting/from_6.2.29/SnapCycleRestart-1.txt - restarting/from_6.2.29/SnapCycleRestart-2.txt) + TEST_FILES restarting/from_6.2.33/SnapCycleRestart-1.txt + restarting/from_6.2.33/SnapCycleRestart-2.txt) add_fdb_test( TEST_FILES restarting/from_5.1.7/DrUpgradeRestart-1.txt restarting/from_5.1.7/DrUpgradeRestart-2.txt) diff --git a/tests/restarting/from_6.2.29/SnapCycleRestart-1.txt b/tests/restarting/from_6.2.33/SnapCycleRestart-1.txt similarity index 100% rename from tests/restarting/from_6.2.29/SnapCycleRestart-1.txt rename to tests/restarting/from_6.2.33/SnapCycleRestart-1.txt diff --git a/tests/restarting/from_6.2.29/SnapCycleRestart-2.txt b/tests/restarting/from_6.2.33/SnapCycleRestart-2.txt similarity index 100% rename from tests/restarting/from_6.2.29/SnapCycleRestart-2.txt rename to tests/restarting/from_6.2.33/SnapCycleRestart-2.txt diff --git a/tests/restarting/from_6.2.29/SnapTestAttrition-1.txt b/tests/restarting/from_6.2.33/SnapTestAttrition-1.txt similarity index 100% rename from tests/restarting/from_6.2.29/SnapTestAttrition-1.txt rename to tests/restarting/from_6.2.33/SnapTestAttrition-1.txt diff --git a/tests/restarting/from_6.2.29/SnapTestAttrition-2.txt b/tests/restarting/from_6.2.33/SnapTestAttrition-2.txt similarity index 100% rename from tests/restarting/from_6.2.29/SnapTestAttrition-2.txt rename to tests/restarting/from_6.2.33/SnapTestAttrition-2.txt diff --git a/tests/restarting/from_6.2.29/SnapTestRestart-1.txt b/tests/restarting/from_6.2.33/SnapTestRestart-1.txt similarity index 100% rename from tests/restarting/from_6.2.29/SnapTestRestart-1.txt rename to tests/restarting/from_6.2.33/SnapTestRestart-1.txt diff --git a/tests/restarting/from_6.2.29/SnapTestRestart-2.txt 
b/tests/restarting/from_6.2.33/SnapTestRestart-2.txt similarity index 100% rename from tests/restarting/from_6.2.29/SnapTestRestart-2.txt rename to tests/restarting/from_6.2.33/SnapTestRestart-2.txt diff --git a/tests/restarting/from_6.2.29/SnapTestSimpleRestart-1.txt b/tests/restarting/from_6.2.33/SnapTestSimpleRestart-1.txt similarity index 100% rename from tests/restarting/from_6.2.29/SnapTestSimpleRestart-1.txt rename to tests/restarting/from_6.2.33/SnapTestSimpleRestart-1.txt diff --git a/tests/restarting/from_6.2.29/SnapTestSimpleRestart-2.txt b/tests/restarting/from_6.2.33/SnapTestSimpleRestart-2.txt similarity index 100% rename from tests/restarting/from_6.2.29/SnapTestSimpleRestart-2.txt rename to tests/restarting/from_6.2.33/SnapTestSimpleRestart-2.txt From 69dbe04d42a4dbe42f2b4e453c4d1856ff08e23a Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Fri, 28 May 2021 14:34:20 -0700 Subject: [PATCH 461/461] Rename WeakFutureReference to UnsafeWeakFutureReference and add warning comment --- fdbrpc/AsyncFileCached.actor.cpp | 2 +- fdbrpc/AsyncFileCached.actor.h | 4 ++-- fdbrpc/sim2.actor.cpp | 2 +- fdbrpc/simulator.h | 2 +- flow/genericactors.actor.h | 21 +++++++++++++-------- 5 files changed, 18 insertions(+), 13 deletions(-) diff --git a/fdbrpc/AsyncFileCached.actor.cpp b/fdbrpc/AsyncFileCached.actor.cpp index 984795c105..6354e55cd0 100644 --- a/fdbrpc/AsyncFileCached.actor.cpp +++ b/fdbrpc/AsyncFileCached.actor.cpp @@ -47,7 +47,7 @@ EvictablePage::~EvictablePage() { } // A map of filename to the file handle for all opened cached files -std::map> AsyncFileCached::openFiles; +std::map> AsyncFileCached::openFiles; void AsyncFileCached::remove_page(AFCPage* page) { pages.erase(page->pageOffset); diff --git a/fdbrpc/AsyncFileCached.actor.h b/fdbrpc/AsyncFileCached.actor.h index 2915b0557c..84c42f9716 100644 --- a/fdbrpc/AsyncFileCached.actor.h +++ b/fdbrpc/AsyncFileCached.actor.h @@ -146,7 +146,7 @@ public: if (f.isReady() && f.isError()) return f; if 
(!f.isReady()) - openFiles[filename] = WeakFutureReference(f); + openFiles[filename] = UnsafeWeakFutureReference(f); else return f.get(); } @@ -250,7 +250,7 @@ public: private: // A map of filename to the file handle for all opened cached files - static std::map> openFiles; + static std::map> openFiles; std::string filename; Reference uncached; diff --git a/fdbrpc/sim2.actor.cpp b/fdbrpc/sim2.actor.cpp index f11caa5461..ee735b963a 100644 --- a/fdbrpc/sim2.actor.cpp +++ b/fdbrpc/sim2.actor.cpp @@ -2465,7 +2465,7 @@ Future> Sim2FileSystem::open(const std::string& file diskParameters, (flags & IAsyncFile::OPEN_NO_AIO) == 0); - machineCache[actualFilename] = WeakFutureReference(f); + machineCache[actualFilename] = UnsafeWeakFutureReference(f); } else { f = itr->second.get(); } diff --git a/fdbrpc/simulator.h b/fdbrpc/simulator.h index 19bed013f2..f83686f464 100644 --- a/fdbrpc/simulator.h +++ b/fdbrpc/simulator.h @@ -194,7 +194,7 @@ public: std::vector processes; // A map from filename to file handle for all open files on a machine - std::map> openFiles; + std::map> openFiles; std::set deletingFiles; std::set closingFiles; diff --git a/flow/genericactors.actor.h b/flow/genericactors.actor.h index 88360685cc..400b9cdf41 100644 --- a/flow/genericactors.actor.h +++ b/flow/genericactors.actor.h @@ -1902,11 +1902,14 @@ Future operator>>(Future const& lhs, Future const& rhs) { // A weak reference type to wrap a future Reference object. // Once the future is complete, this object holds a pointer to the referenced object but does // not contribute to its reference count. +// +// WARNING: this class will not be aware when the underlying object is destroyed. It is up to the +// user to make sure that an UnsafeWeakFutureReference is discarded at the same time the object is. 
template -class WeakFutureReference { +class UnsafeWeakFutureReference { public: - WeakFutureReference() {} - WeakFutureReference(Future> future) : data(new WeakFutureReferenceData(future)) {} + UnsafeWeakFutureReference() {} + UnsafeWeakFutureReference(Future> future) : data(new UnsafeWeakFutureReferenceData(future)) {} // Returns a future to obtain a normal reference handle // If the future is ready, this creates a Reference to wrap the object @@ -1926,17 +1929,19 @@ public: Optional getPtrIfReady() { return data->ptr; } private: - // A class to hold the state for a WeakFutureReference - struct WeakFutureReferenceData : public ReferenceCounted, NonCopyable { + // A class to hold the state for an UnsafeWeakFutureReference + struct UnsafeWeakFutureReferenceData : public ReferenceCounted, NonCopyable { Optional ptr; Future> future; Future moveResultFuture; - WeakFutureReferenceData(Future> future) : future(future) { moveResultFuture = moveResult(this); } + UnsafeWeakFutureReferenceData(Future> future) : future(future) { + moveResultFuture = moveResult(this); + } // Waits for the future to complete and then stores the pointer in local storage // When this completes, we will no longer be counted toward the reference count of the object - ACTOR Future moveResult(WeakFutureReferenceData* self) { + ACTOR Future moveResult(UnsafeWeakFutureReferenceData* self) { Reference result = wait(self->future); self->ptr = result.getPtr(); self->future = Future>(); @@ -1944,7 +1949,7 @@ private: } }; - Reference data; + Reference data; }; #include "flow/unactorcompiler.h"