Merge remote-tracking branch 'origin/master' into 33300740-with-shutdown-hooks

2017-09-26 11:28:40 -07:00 · 2017-09-26 11:28:40 -07:00 · a5f1c3b15b
parent 59aae5e994 298b54104e
commit a5f1c3b15b
106 changed files with 3100 additions and 1740 deletions
--- a/bindings/c/local.mk
+++ b/bindings/c/local.mk
@ -23,14 +23,18 @@
 fdb_c_CFLAGS := $(fdbclient_CFLAGS)
 fdb_c_LDFLAGS := $(fdbrpc_LDFLAGS)
 fdb_c_LIBS := lib/libfdbclient.a lib/libfdbrpc.a lib/libflow.a
+fdb_c_tests_LIBS := -Llib -lfdb_c
+fdb_c_tests_HEADERS := -Ibindings/c

 ifeq ($(PLATFORM),linux)
  fdb_c_LIBS += lib/libstdc++.a -lm -lpthread -lrt -ldl
  fdb_c_LDFLAGS += -Wl,--version-script=bindings/c/fdb_c.map -static-libgcc -Wl,-z,nodelete
+  fdb_c_tests_LIBS += -lpthread
 endif

 ifeq ($(PLATFORM),osx)
  fdb_c_LDFLAGS += -lc++ -Xlinker -exported_symbols_list -Xlinker bindings/c/fdb_c.symbols
+  fdb_c_tests_LIBS += -lpthread

  lib/libfdb_c.dylib: bindings/c/fdb_c.symbols

@ -74,3 +78,24 @@ fdb_c_BUILD_SOURCES += bindings/c/fdb_c.g.S
 bindings/c/foundationdb/fdb_c_options.g.h: bin/vexillographer.exe fdbclient/vexillographer/fdb.options $(ALL_MAKEFILES)
 	@echo "Building       $@"
 	@$(MONO) bin/vexillographer.exe fdbclient/vexillographer/fdb.options c $@
+
+# Builds the C performance test against the freshly built fdb_c library.
+# The libraries are listed AFTER the source file: the linker resolves symbols
+# left to right, so a library named before the object that needs it may be
+# discarded (e.g. under --as-needed), causing undefined references.
+bin/fdb_c_performance_test: bindings/c/test/performance_test.c bindings/c/test/test.h fdb_c
+	@echo "Compiling      fdb_c_performance_test"
+	@$(CC) $(CFLAGS) $(fdb_c_tests_HEADERS) -o $@ bindings/c/test/performance_test.c $(fdb_c_tests_LIBS)
+
+# Builds the C read-your-writes benchmark against the fdb_c library.
+# Libraries follow the source file so the linker sees the undefined symbols
+# before it scans the libraries that satisfy them.
+bin/fdb_c_ryw_benchmark: bindings/c/test/ryw_benchmark.c bindings/c/test/test.h fdb_c
+	@echo "Compiling      fdb_c_ryw_benchmark"
+	@$(CC) $(CFLAGS) $(fdb_c_tests_HEADERS) -o $@ bindings/c/test/ryw_benchmark.c $(fdb_c_tests_LIBS)
+
+# Packages both C test binaries into packages/fdb-c-tests-$(VERSION)-$(PLATFORM).tar.gz.
+# The binaries are staged in a scratch directory of the same name (without
+# the .tar.gz suffix), which is removed both before staging and after the
+# archive has been written.
+packages/fdb-c-tests-$(VERSION)-$(PLATFORM).tar.gz: bin/fdb_c_performance_test bin/fdb_c_ryw_benchmark
+	@echo "Packaging      $@"
+	@rm -rf packages/fdb-c-tests-$(VERSION)-$(PLATFORM)
+	@mkdir -p packages/fdb-c-tests-$(VERSION)-$(PLATFORM)/bin
+	@cp bin/fdb_c_performance_test packages/fdb-c-tests-$(VERSION)-$(PLATFORM)/bin
+	@cp bin/fdb_c_ryw_benchmark packages/fdb-c-tests-$(VERSION)-$(PLATFORM)/bin
+	@tar -C packages -czvf $@ fdb-c-tests-$(VERSION)-$(PLATFORM) > /dev/null
+	@rm -rf packages/fdb-c-tests-$(VERSION)-$(PLATFORM)
+
+# Short alias for building the C test package.
+fdb_c_tests: packages/fdb-c-tests-$(VERSION)-$(PLATFORM).tar.gz
+
+# Make the C test package a prerequisite of the "packages" target.
+packages: fdb_c_tests
--- a/bindings/c/test/performance_test.c
+++ b/bindings/c/test/performance_test.c
@ -0,0 +1,623 @@
+/*
+ * performance_test.c
+ *
+ * This source file is part of the FoundationDB open source project
+ *
+ * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors
+ * 
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "test.h"
+#include <foundationdb/fdb_c.h>
+#include <foundationdb/fdb_c_options.g.h>
+
+#include <stdio.h>
+#include <pthread.h>
+
+// Handle of the thread created by openDatabase(); passed to it by address.
+pthread_t netThread;
+
+// Benchmark data set: `keys` is filled by generateKeys(numKeys, keySize) in
+// main(); `valueStr` is a valueSize-byte buffer of 'x' written for every key.
+int numKeys = 1000000;
+int keySize = 16;
+uint8_t** keys = NULL;
+int valueSize = 100;
+uint8_t *valueStr = NULL;
+
+/*
+ * Blocks until the future is ready and reports its outcome: the error from
+ * blocking if that failed, otherwise the error held by the future itself.
+ */
+fdb_error_t waitError(FDBFuture *f) {
+	fdb_error_t waitFailure = fdb_future_block_until_ready(f);
+	if(waitFailure) {
+		return waitFailure;
+	}
+	return fdb_future_get_error(f);
+}
+
+/*
+ * Runs func inside a single transaction on db.
+ *
+ * When func succeeds the transaction is committed; when func or the commit
+ * fails, the error is handed to fdb_transaction_on_error() and func is run
+ * again unless on_error itself reports an error.
+ *
+ * Returns func's result after a successful commit, or RES(0, err) with the
+ * error reported by on_error. The transaction is created exactly once and is
+ * destroyed on every return path.
+ */
+struct RunResult run(struct ResultSet *rs, FDBDatabase *db, struct RunResult (*func)(struct ResultSet*, FDBTransaction*)) {
+	FDBTransaction *tr = NULL;
+	// Create the transaction once; a second create would overwrite `tr` and
+	// leak the first handle.
+	checkError(fdb_database_create_transaction(db, &tr), "create transaction", rs);
+
+	while(1) {
+		struct RunResult r = func(rs, tr);
+		fdb_error_t e = r.e;
+		if(!e) {
+			FDBFuture *f = fdb_transaction_commit(tr);
+			e = waitError(f);
+			fdb_future_destroy(f);
+		}
+
+		if(!e) {
+			fdb_transaction_destroy(tr);
+			return r;
+		}
+
+		FDBFuture *f = fdb_transaction_on_error(tr, e);
+		fdb_error_t retryE = waitError(f);
+		fdb_future_destroy(f);
+		if(retryE) {
+			// on_error gave up: release the transaction and surface the error.
+			fdb_transaction_destroy(tr);
+			return RES(0, retryE);
+		}
+	}
+
+	return RES(0, 4100); // internal_error ; we should never get here
+}
+
+/*
+ * Executes testFxn 25 times through run() and records the median of the
+ * per-run results as a "keys/s" KPI named kpiName.
+ *
+ * Returns the median; 0 if a run reported an error (the error is logged
+ * under kpiName); -1 if a run produced a negative result. No KPI is added
+ * in the two failure cases.
+ */
+int runTest(struct RunResult (*testFxn)(struct ResultSet*, FDBTransaction*), FDBDatabase *db, struct ResultSet *rs, const char *kpiName) {
+	const int numRuns = 25;
+	int *samples = malloc(sizeof(int)*numRuns);
+	int outcome = 0;
+	int completed = 1;
+	int runIdx;
+
+	for(runIdx = 0; runIdx < numRuns; ++runIdx) {
+		struct RunResult res = run(rs, db, testFxn);
+		if(res.e) {
+			logError(res.e, kpiName, rs);
+			outcome = 0;
+			completed = 0;
+			break;
+		}
+		samples[runIdx] = res.res;
+		if(res.res < 0) {
+			outcome = -1;
+			completed = 0;
+			break;
+		}
+	}
+
+	if(completed) {
+		outcome = median(samples, numRuns);
+	}
+	free(samples);
+
+	if(completed) {
+		addKpi(rs, kpiName, outcome, "keys/s");
+	}
+
+	return outcome;
+}
+
+/*
+ * Database-level variant of runTest(): calls testFxn(rs, db) directly 25
+ * times (no surrounding run()) and records the median result as a "keys/s"
+ * KPI named kpiName.
+ *
+ * Returns the median; 0 if a call reported an error (logged under kpiName);
+ * -1 if a call produced a negative result.
+ */
+int runTestDb(struct RunResult (*testFxn)(struct ResultSet*, FDBDatabase*), FDBDatabase *db, struct ResultSet *rs, const char *kpiName) {
+	const int numRuns = 25;
+	int *samples = malloc(sizeof(int)*numRuns);
+	int outcome = 0;
+	int completed = 1;
+	int runIdx;
+
+	for(runIdx = 0; runIdx < numRuns; ++runIdx) {
+		struct RunResult res = testFxn(rs, db);
+		if(res.e) {
+			logError(res.e, kpiName, rs);
+			outcome = 0;
+			completed = 0;
+			break;
+		}
+		samples[runIdx] = res.res;
+		if(res.res < 0) {
+			outcome = -1;
+			completed = 0;
+			break;
+		}
+	}
+
+	if(completed) {
+		outcome = median(samples, numRuns);
+	}
+	free(samples);
+
+	if(completed) {
+		addKpi(rs, kpiName, outcome, "keys/s");
+	}
+
+	return outcome;
+}
+
+
+/* Transaction body: clears the key range from "" (empty key) up to "\xff". */
+struct RunResult clearAll(struct ResultSet *rs, FDBTransaction *tr) {
+	const uint8_t *rangeBegin = (uint8_t*)"";
+	const uint8_t *rangeEnd = (uint8_t*)"\xff";
+
+	fdb_transaction_clear_range(tr, rangeBegin, 0, rangeEnd, 1);
+	return RES(0, 0);
+}
+
+// Half-open window [start, stop) of key indices written by insertRange();
+// advanced by insertData() between transactions.
+uint32_t start = 0;
+uint32_t stop = 0;
+/* Transaction body: sets keys[start] .. keys[stop - 1] to valueStr. */
+struct RunResult insertRange(struct ResultSet *rs, FDBTransaction *tr) {
+	uint32_t idx = start;
+	while(idx < stop) {
+		fdb_transaction_set(tr, keys[idx], keySize, valueStr, valueSize);
+		idx++;
+	}
+	return RES(0, 0);
+}
+
+/*
+ * Clears the database, then writes all numKeys keys in batches of at most
+ * 1000 keys per transaction. Any failure is fatal via checkError().
+ */
+void insertData(struct ResultSet *rs, FDBDatabase *db) {
+	checkError(run(rs, db, &clearAll).e, "clearing database", rs);
+
+	// TODO: Do this asynchronously.
+	for(start = 0; start < numKeys; start = stop) {
+		stop = start + 1000;
+		if(stop > numKeys) {
+			stop = numKeys;
+		}
+		checkError(run(rs, db, &insertRange).e, "inserting data range", rs);
+	}
+}
+
+/*
+ * Sets FDB_TR_OPTION_RETRY_LIMIT on tr to `limit`, passed as the raw 8 bytes
+ * of the uint64_t. Returns the error from fdb_transaction_set_option().
+ */
+fdb_error_t setRetryLimit(struct ResultSet *rs, FDBTransaction *tr, uint64_t limit) {
+	const uint8_t *optionValue = (const uint8_t*)&limit;
+	return fdb_transaction_set_option(tr, FDB_TR_OPTION_RETRY_LIMIT, optionValue, sizeof(uint64_t));
+}
+
+uint32_t FUTURE_LATENCY_COUNT = 100000;
+const char *FUTURE_LATENCY_KPI = "C future throughput (local client)";
+/*
+ * Measures how many read-version futures per second can be created, waited
+ * on and destroyed. One read version is fetched before timing starts.
+ * Returns RES(rate, 0), or RES(0, err) on the first error.
+ */
+struct RunResult futureLatency(struct ResultSet *rs, FDBTransaction *tr) {
+	fdb_error_t err = maybeLogError(setRetryLimit(rs, tr, 5), "setting retry limit", rs);
+	if(err) return RES(0, err);
+
+	// Untimed initial read version.
+	FDBFuture *initial = fdb_transaction_get_read_version(tr);
+	err = waitError(initial);
+	fdb_future_destroy(initial);
+	maybeLogError(err, "getting initial read version", rs);
+	if(err) return RES(0, err);
+
+	double began = getTime();
+	uint32_t done = 0;
+	while(done < FUTURE_LATENCY_COUNT) {
+		FDBFuture *versionFuture = fdb_transaction_get_read_version(tr);
+		err = waitError(versionFuture);
+		fdb_future_destroy(versionFuture);
+		maybeLogError(err, "getting read version", rs);
+		if(err) return RES(0, err);
+		done++;
+	}
+	double finished = getTime();
+
+	return RES(FUTURE_LATENCY_COUNT/(finished - began), 0);
+}
+
+uint32_t CLEAR_COUNT = 100000;
+const char *CLEAR_KPI = "C clear throughput (local client)";
+/*
+ * Times CLEAR_COUNT single-key clears of randomly chosen keys, then resets
+ * the transaction so nothing is actually removed. Returns RES(rate, 0).
+ */
+struct RunResult clear(struct ResultSet *rs, FDBTransaction *tr) {
+	double began = getTime();
+	uint32_t n;
+	for(n = 0; n < CLEAR_COUNT; ++n) {
+		int victim = ((uint64_t)rand()) % numKeys;
+		fdb_transaction_clear(tr, keys[victim], keySize);
+	}
+	double finished = getTime();
+
+	fdb_transaction_reset(tr); // Don't actually clear things.
+	return RES(CLEAR_COUNT/(finished - began), 0);
+}
+
+uint32_t CLEAR_RANGE_COUNT = 100000;
+const char *CLEAR_RANGE_KPI = "C clear range throughput (local client)";
+/*
+ * Times CLEAR_RANGE_COUNT range clears, each spanning [keys[k], keys[k+1])
+ * for a random k, then resets the transaction so nothing is actually
+ * removed. Returns RES(rate, 0).
+ */
+struct RunResult clearRange(struct ResultSet *rs, FDBTransaction *tr) {
+	double began = getTime();
+	uint32_t n;
+	for(n = 0; n < CLEAR_RANGE_COUNT; ++n) {
+		int lower = ((uint64_t)rand()) % (numKeys - 1);
+		fdb_transaction_clear_range(tr, keys[lower], keySize, keys[lower+1], keySize);
+	}
+	double finished = getTime();
+
+	fdb_transaction_reset(tr); // Don't actually clear things.
+	return RES(CLEAR_RANGE_COUNT/(finished - began), 0);
+}
+
+uint32_t SET_COUNT = 100000;
+const char *SET_KPI = "C set throughput (local client)";
+/*
+ * Times SET_COUNT sets of valueStr on randomly chosen keys, then resets the
+ * transaction so nothing is actually written. Returns RES(rate, 0).
+ */
+struct RunResult set(struct ResultSet *rs, FDBTransaction *tr) {
+	double began = getTime();
+	uint32_t n;
+	for(n = 0; n < SET_COUNT; ++n) {
+		int target = ((uint64_t)rand()) % numKeys;
+		fdb_transaction_set(tr, keys[target], keySize, valueStr, valueSize);
+	}
+	double finished = getTime();
+
+	fdb_transaction_reset(tr); // Don't actually set things.
+	return RES(SET_COUNT/(finished - began), 0);
+}
+
+uint32_t PARALLEL_GET_COUNT = 10000;
+const char *PARALLEL_GET_KPI = "C parallel get throughput (local client)";
+/*
+ * Issues PARALLEL_GET_COUNT gets for random keys up front, then waits for
+ * and reads each result in order, timing the whole exchange.
+ *
+ * Returns RES(rate, 0) on success or RES(0, err) on the first error. On
+ * every path all outstanding futures are destroyed and the future array is
+ * freed.
+ */
+struct RunResult parallelGet(struct ResultSet *rs, FDBTransaction *tr) {
+	fdb_error_t e = maybeLogError(setRetryLimit(rs, tr, 5), "setting retry limit", rs);
+	if(e) return RES(0, e);
+
+	FDBFuture **futures = (FDBFuture**)malloc((sizeof(FDBFuture*)) * PARALLEL_GET_COUNT);
+	if(futures == NULL) {
+		logError(4100, "allocating get futures", rs);
+		return RES(0, 4100);
+	}
+
+	double start = getTime();
+
+	uint32_t i;
+	for(i = 0; i < PARALLEL_GET_COUNT; i++) {
+		int k = ((uint64_t)rand()) % numKeys;
+		futures[i] = fdb_transaction_get(tr, keys[k], keySize, 0);
+	}
+
+	fdb_bool_t present;
+	uint8_t const *outValue;
+	int outValueLength;
+
+	for(i = 0; i < PARALLEL_GET_COUNT; i++) {
+		e = maybeLogError(fdb_future_block_until_ready(futures[i]), "waiting for get future", rs);
+		if(!e) {
+			e = maybeLogError(fdb_future_get_value(futures[i], &present, &outValue, &outValueLength), "getting future value", rs);
+		}
+		if(e) {
+			break;
+		}
+
+		fdb_future_destroy(futures[i]);
+	}
+
+	if(e) {
+		// Release the failed future and every future not yet consumed,
+		// then the array itself, before reporting the error.
+		for(; i < PARALLEL_GET_COUNT; i++) {
+			fdb_future_destroy(futures[i]);
+		}
+		free(futures);
+		return RES(0, e);
+	}
+
+	double end = getTime();
+
+	free(futures);
+	return RES(PARALLEL_GET_COUNT/(end - start), 0);
+}
+
+uint32_t ALTERNATING_GET_SET_COUNT = 2000;
+const char *ALTERNATING_GET_SET_KPI = "C alternating get set throughput (local client)";
+/*
+ * Alternates a set and a get on the same random key
+ * ALTERNATING_GET_SET_COUNT times, then waits for and reads every get,
+ * timing the whole exchange.
+ *
+ * Returns RES(rate, 0) on success or RES(0, err) on the first error. On
+ * every path all outstanding futures are destroyed and the future array is
+ * freed.
+ */
+struct RunResult alternatingGetSet(struct ResultSet *rs, FDBTransaction *tr) {
+	fdb_error_t e = maybeLogError(setRetryLimit(rs, tr, 5), "setting retry limit", rs);
+	if(e) return RES(0, e);
+
+	FDBFuture **futures = (FDBFuture**)malloc((sizeof(FDBFuture*)) * ALTERNATING_GET_SET_COUNT);
+	if(futures == NULL) {
+		logError(4100, "allocating get futures", rs);
+		return RES(0, 4100);
+	}
+
+	double start = getTime();
+
+	uint32_t i;
+	for(i = 0; i < ALTERNATING_GET_SET_COUNT; i++) {
+		int k = ((uint64_t)rand()) % numKeys;
+		fdb_transaction_set(tr, keys[k], keySize, valueStr, valueSize);
+		futures[i] = fdb_transaction_get(tr, keys[k], keySize, 0);
+	}
+
+	fdb_bool_t present;
+	uint8_t const *outValue;
+	int outValueLength;
+
+	for(i = 0; i < ALTERNATING_GET_SET_COUNT; i++) {
+		e = maybeLogError(fdb_future_block_until_ready(futures[i]), "waiting for get future", rs);
+		if(!e) {
+			e = maybeLogError(fdb_future_get_value(futures[i], &present, &outValue, &outValueLength), "getting future value", rs);
+		}
+		if(e) {
+			break;
+		}
+
+		fdb_future_destroy(futures[i]);
+	}
+
+	if(e) {
+		// Release the failed future and every future not yet consumed,
+		// then the array itself, before reporting the error.
+		for(; i < ALTERNATING_GET_SET_COUNT; i++) {
+			fdb_future_destroy(futures[i]);
+		}
+		free(futures);
+		return RES(0, e);
+	}
+
+	double end = getTime();
+
+	free(futures);
+	return RES(ALTERNATING_GET_SET_COUNT/(end - start), 0);
+}
+
+uint32_t SERIAL_GET_COUNT = 2000;
+const char *SERIAL_GET_KPI = "C serial get throughput (local client)";
+/*
+ * Times SERIAL_GET_COUNT gets issued one at a time, each waited on before
+ * the next is sent. The key indices are chosen before timing starts; they
+ * are made distinct unless SERIAL_GET_COUNT exceeds half of numKeys.
+ * Returns RES(rate, 0), or RES(0, err) on the first error.
+ */
+struct RunResult serialGet(struct ResultSet *rs, FDBTransaction *tr) {
+	fdb_error_t err = maybeLogError(setRetryLimit(rs, tr, 5), "setting retry limit", rs);
+	if(err) return RES(0, err);
+
+	int slot;
+	uint32_t *picked = (uint32_t*)malloc((sizeof(uint32_t)) * SERIAL_GET_COUNT);
+
+	if(SERIAL_GET_COUNT > numKeys/2) {
+		// Sample with replacement.
+		for(slot = 0; slot < SERIAL_GET_COUNT; slot++) {
+			picked[slot] = ((uint64_t)rand()) % numKeys;
+		}
+	} else {
+		for(slot = 0; slot < SERIAL_GET_COUNT; slot++) {
+			uint32_t candidate;
+			fdb_bool_t duplicate;
+			do {
+				// Yes, this is a linear scan. This happens outside
+				// the part we are measuring.
+				candidate = ((uint64_t)rand()) % numKeys;
+				duplicate = 0;
+				int prior;
+				for(prior = 0; prior < slot && !duplicate; prior++) {
+					duplicate = (picked[prior] == candidate);
+				}
+			} while(duplicate);
+			picked[slot] = candidate;
+		}
+	}
+
+	double began = getTime();
+
+	fdb_bool_t present;
+	uint8_t const *outValue;
+	int outValueLength;
+
+	for(slot = 0; slot < SERIAL_GET_COUNT; slot++) {
+		FDBFuture *f = fdb_transaction_get(tr, keys[picked[slot]], keySize, 0);
+		fdb_error_t getErr = maybeLogError(fdb_future_block_until_ready(f), "getting key in serial", rs);
+		if(!getErr) {
+			getErr = maybeLogError(fdb_future_get_value(f, &present, &outValue, &outValueLength), "getting future value", rs);
+		}
+		fdb_future_destroy(f);
+		if(getErr) {
+			free(picked);
+			return RES(0, getErr);
+		}
+	}
+
+	double finished = getTime();
+
+	free(picked);
+	return RES(SERIAL_GET_COUNT/(finished - began), 0);
+}
+
+uint32_t GET_RANGE_COUNT = 100000;
+const char *GET_RANGE_KPI = "C get range throughput (local client)";
+/*
+ * Times a range read of GET_RANGE_COUNT consecutive keys starting at a
+ * random index, following continuation batches until the read reports no
+ * more data, and checks that exactly GET_RANGE_COUNT pairs came back.
+ *
+ * Returns RES(rate, 0) on success, RES(0, err) on an FDB error, or
+ * RES(0, 4100) when the returned data is inconsistent.
+ */
+struct RunResult getRange(struct ResultSet *rs, FDBTransaction *tr) {
+	fdb_error_t e = maybeLogError(setRetryLimit(rs, tr, 5), "setting retry limit", rs);
+	if(e) return RES(0, e);
+
+	uint32_t startKey = ((uint64_t)rand()) % (numKeys - GET_RANGE_COUNT - 1);
+
+	double start = getTime();
+
+	const FDBKeyValue *outKv;
+	int outCount;
+	fdb_bool_t outMore = 1;
+	int totalOut = 0;
+	int iteration = 0;
+
+	FDBFuture *f = fdb_transaction_get_range(tr,
+		keys[startKey], keySize, 1, 0,
+		keys[startKey + GET_RANGE_COUNT], keySize, 1, 0,
+		0, 0,
+		FDB_STREAMING_MODE_WANT_ALL, ++iteration, 0, 0);
+
+	while(outMore) {
+		e = maybeLogError(fdb_future_block_until_ready(f), "getting range", rs);
+		if(e) {
+			fdb_future_destroy(f);
+			return RES(0, e);
+		}
+
+		e = maybeLogError(fdb_future_get_keyvalue_array(f, &outKv, &outCount, &outMore), "reading range array", rs);
+		if(e) {
+			fdb_future_destroy(f);
+			return RES(0, e);
+		}
+
+		totalOut += outCount;
+
+		if(outMore) {
+			// The continuation starts from the last key of this batch, so
+			// an empty batch leaves nothing to continue from.
+			if(outCount <= 0) {
+				logError(4100, "continuing range read after empty batch", rs);
+				fdb_future_destroy(f);
+				return RES(0, 4100);
+			}
+
+			// outKv points into memory owned by f, so the next request
+			// must be issued before f is destroyed.
+			FDBFuture *f2 = fdb_transaction_get_range(tr,
+				outKv[outCount - 1].key, outKv[outCount - 1].key_length, 1, 1,
+				keys[startKey + GET_RANGE_COUNT], keySize, 1, 0,
+				0, 0,
+				FDB_STREAMING_MODE_WANT_ALL, ++iteration, 0, 0);
+			fdb_future_destroy(f);
+			f = f2;
+		}
+	}
+
+	// The loop only exits once outMore is false, so only the count needs
+	// verifying here.
+	if((uint32_t)totalOut != GET_RANGE_COUNT) {
+		char msg[64];
+		snprintf(msg, sizeof(msg), "verifying out count (%d != %u)", totalOut, (unsigned)GET_RANGE_COUNT);
+		logError(4100, msg, rs);
+		fdb_future_destroy(f);
+		return RES(0, 4100);
+	}
+	fdb_future_destroy(f);
+
+	double end = getTime();
+
+	return RES(GET_RANGE_COUNT/(end - start), 0);
+}
+
+uint32_t GET_KEY_COUNT = 2000;
+const char *GET_KEY_KPI = "C get key throughput (local client)";
+/*
+ * Times GET_KEY_COUNT key-selector resolutions, each anchored at a random
+ * key with a random offset in [-10, 10] and waited on before the next is
+ * issued.
+ *
+ * Returns RES(rate, 0), or RES(0, err) on the first error.
+ */
+struct RunResult getKey(struct ResultSet *rs, FDBTransaction *tr) {
+	fdb_error_t e = maybeLogError(setRetryLimit(rs, tr, 5), "setting retry limit", rs);
+	if(e) return RES(0, e);
+
+	double start = getTime();
+
+	uint8_t const *outKey;
+	int outKeyLength;
+
+	uint32_t i;
+	for(i = 0; i < GET_KEY_COUNT; i++) {
+		int key = ((uint64_t)rand()) % numKeys;
+		// Reduce to 0..20 and convert to int BEFORE subtracting, so the
+		// subtraction is not carried out in unsigned 64-bit arithmetic.
+		int offset = (int)(((uint64_t)rand()) % 21) - 10;
+		FDBFuture *f = fdb_transaction_get_key(tr, keys[key], keySize, 1, offset, 0);
+
+		e = maybeLogError(fdb_future_block_until_ready(f), "waiting for get key", rs);
+		if(e) {
+			fdb_future_destroy(f);
+			return RES(0, e);
+		}
+
+		// The future comes from fdb_transaction_get_key(), so it must be
+		// read with the key accessor rather than fdb_future_get_value().
+		e = maybeLogError(fdb_future_get_key(f, &outKey, &outKeyLength), "getting future key", rs);
+		fdb_future_destroy(f);
+		if(e) {
+			return RES(0, e);
+		}
+	}
+
+	double end = getTime();
+
+	return RES(GET_KEY_COUNT/(end - start), 0);
+}
+
+uint32_t GET_SINGLE_KEY_RANGE_COUNT = 2000;
+const char *GET_SINGLE_KEY_RANGE_KPI = "C get_single_key_range throughput (local client)";
+/*
+ * Times GET_SINGLE_KEY_RANGE_COUNT range reads between keys[k] and
+ * keys[k+1] for random k, each limited to 2 rows in EXACT streaming mode,
+ * and checks that every read yields one pair with no more data pending.
+ * Returns RES(rate, 0), RES(0, err) on an FDB error, or RES(0, 4100) when a
+ * read returns an unexpected shape.
+ */
+struct RunResult getSingleKeyRange(struct ResultSet *rs, FDBTransaction *tr) {
+	fdb_error_t err = maybeLogError(setRetryLimit(rs, tr, 5), "setting retry limit", rs);
+	if(err) return RES(0, err);
+
+	double began = getTime();
+
+	const FDBKeyValue *pairs;
+	int pairCount;
+	fdb_bool_t morePending;
+
+	uint32_t n;
+	for(n = 0; n < GET_SINGLE_KEY_RANGE_COUNT; n++) {
+		int lower = ((uint64_t)rand()) % (numKeys - 1);
+		FDBFuture *f = fdb_transaction_get_range(tr,
+			keys[lower], keySize, 1, 0,
+			keys[lower + 1], keySize, 1, 0,
+			2, 0,
+			FDB_STREAMING_MODE_EXACT, 1, 0, 0);
+
+		err = maybeLogError(fdb_future_block_until_ready(f), "waiting for single key range", rs);
+		if(!err) {
+			err = maybeLogError(fdb_future_get_keyvalue_array(f, &pairs, &pairCount, &morePending), "reading single key range array", rs);
+		}
+		if(err) {
+			fdb_future_destroy(f);
+			return RES(0, err);
+		}
+
+		int malformed = 0;
+		if(pairCount != 1) {
+			logError(4100, "more than one key returned in single key range read", rs);
+			malformed = 1;
+		} else if(morePending) {
+			logError(4100, "more keys to read in single key range read", rs);
+			malformed = 1;
+		}
+
+		fdb_future_destroy(f);
+		if(malformed) {
+			return RES(0, 4100);
+		}
+	}
+
+	double finished = getTime();
+
+	return RES(GET_SINGLE_KEY_RANGE_COUNT/(finished - began), 0);
+}
+
+/* Transaction body: sets one randomly chosen key to valueStr. */
+struct RunResult singleKey(struct ResultSet *rs, FDBTransaction *tr) {
+	int chosen = ((uint64_t)rand()) % numKeys;
+	uint8_t *key = keys[chosen];
+
+	fdb_transaction_set(tr, key, keySize, valueStr, valueSize);
+	return RES(0, 0);
+}
+
+uint32_t WRITE_TRANSACTION_COUNT = 1000;
+const char *WRITE_TRANSACTION_KPI = "C write_transaction throughput (local client)";
+/*
+ * Times WRITE_TRANSACTION_COUNT consecutive run() calls, each of which sets
+ * a single random key via singleKey(). Returns RES(rate, 0), or the first
+ * failing run's result unchanged.
+ */
+struct RunResult writeTransaction(struct ResultSet *rs, FDBDatabase *db) {
+	double began = getTime();
+
+	uint32_t committed = 0;
+	while(committed < WRITE_TRANSACTION_COUNT) {
+		struct RunResult outcome = run(rs, db, &singleKey);
+		if(outcome.e) {
+			return outcome;
+		}
+		committed++;
+	}
+
+	double finished = getTime();
+
+	return RES(WRITE_TRANSACTION_COUNT/(finished - began), 0);
+}
+
+/*
+ * Opens the database, loads the benchmark data set and runs every benchmark
+ * in a fixed order, printing each benchmark's label before it starts.
+ *
+ * After the last benchmark the database handle is destroyed, the network is
+ * asked to stop, and the network thread is joined so that fdb_run_network()
+ * has returned before the caller tears down the process.
+ */
+void runTests(struct ResultSet *rs) {
+	FDBDatabase *db = openDatabase(rs, &netThread);
+
+	printf("Loading database...\n");
+	insertData(rs, db);
+
+	// Transaction-level benchmarks, in execution order.
+	const struct {
+		const char *label;
+		struct RunResult (*fn)(struct ResultSet*, FDBTransaction*);
+		const char *kpi;
+	} tests[] = {
+		{ "future_latency", &futureLatency, FUTURE_LATENCY_KPI },
+		{ "clear", &clear, CLEAR_KPI },
+		{ "clear_range", &clearRange, CLEAR_RANGE_KPI },
+		{ "set", &set, SET_KPI },
+		{ "parallel_get", &parallelGet, PARALLEL_GET_KPI },
+		{ "alternating_get_set", &alternatingGetSet, ALTERNATING_GET_SET_KPI },
+		{ "serial_get", &serialGet, SERIAL_GET_KPI },
+		{ "get_range", &getRange, GET_RANGE_KPI },
+		{ "get_key", &getKey, GET_KEY_KPI },
+		{ "get_single_key_range", &getSingleKeyRange, GET_SINGLE_KEY_RANGE_KPI },
+	};
+
+	size_t t;
+	for(t = 0; t < sizeof(tests)/sizeof(tests[0]); t++) {
+		printf("%s\n", tests[t].label);
+		runTest(tests[t].fn, db, rs, tests[t].kpi);
+	}
+
+	// Database-level benchmark: manages its own transactions.
+	printf("write_transaction\n");
+	runTestDb(&writeTransaction, db, rs, WRITE_TRANSACTION_KPI);
+
+	fdb_database_destroy(db);
+	fdb_stop_network();
+
+	// Wait for the network loop to exit before returning to main().
+	if(pthread_join(netThread, NULL) != 0) {
+		fprintf(stderr, "Failed to join network thread\n");
+	}
+}
+
+/*
+ * Entry point: seeds rand(), selects API version 500, builds the value
+ * buffer and key set, runs all benchmarks, writes the result set, and frees
+ * what it allocated.
+ */
+int main(int argc, char **argv) {
+	srand(time(NULL));
+	struct ResultSet *rs = newResultSet();
+	checkError(fdb_select_api_version(500), "select API version", rs);
+	printf("Running performance test at client version: %s\n", fdb_get_client_version());
+
+	// Every key is written with the same valueSize-byte value of 'x'.
+	valueStr = (uint8_t*)malloc((sizeof(uint8_t))*valueSize);
+	memset(valueStr, 'x', valueSize);
+
+	keys = generateKeys(numKeys, keySize);
+	runTests(rs);
+	writeResultSet(rs);
+
+	free(valueStr);
+	freeResultSet(rs);
+	freeKeys(keys, numKeys);
+
+	return 0;
+}
--- a/bindings/c/test/ryw_benchmark.c
+++ b/bindings/c/test/ryw_benchmark.c
@ -19,8 +19,6 @@
 */

 #include "test.h"
-
-#define FDB_API_VERSION 500
 #include <foundationdb/fdb_c.h>
 #include <foundationdb/fdb_c_options.g.h>

@ -32,59 +30,10 @@

 pthread_t netThread;

-void preload(FDBTransaction *tr, int numKeys) {
-	fdb_transaction_clear_range(tr, (uint8_t*)"", 0, (uint8_t*)"\xff", 1);
-
-	uint32_t i;
-	for(i = 0; i < numKeys; ++i) {
-		uint32_t k = htonl(i);
-		fdb_transaction_set(tr, (uint8_t*)&k, 4, (uint8_t*)&k, 4);
-	}
-}
-
-void* runNetwork() {
-	checkError(fdb_run_network(), "run network", NULL);
-	return NULL;
-}
-
-FDBDatabase* openDatabase(struct ResultSet *rs) {
-	checkError(fdb_setup_network(), "setup network", rs);
-	pthread_create(&netThread, NULL, &runNetwork, NULL);
-
-	FDBFuture *f = fdb_create_cluster(NULL);
-	checkError(fdb_future_block_until_ready(f), "block for cluster", rs);
-
-	FDBCluster *cluster;
-	checkError(fdb_future_get_cluster(f, &cluster), "get cluster", rs);
-
-	fdb_future_destroy(f);
-
-	f = fdb_cluster_create_database(cluster, (uint8_t*)"DB", 2);
-	checkError(fdb_future_block_until_ready(f), "block for database", rs);
-
-	FDBDatabase *db;
-	checkError(fdb_future_get_database(f, &db), "get database", rs);
-
-	fdb_future_destroy(f);
-	fdb_cluster_destroy(cluster);
-
-	return db;
-}
-
 int numKeys = 10000;
 int keySize = 16;
 uint8_t** keys;

-void populateKeys() {
-	keys = (uint8_t**)malloc(sizeof(uint8_t*)*(numKeys+1)); // This and its contents are never deallocated
-
-	uint32_t i;
-	for(i = 0; i <= numKeys; ++i) {
-		keys[i] = malloc(keySize);
-		sprintf((char*)keys[i], "%0*d", keySize, i);
-	}
-}
-
 void insertData(FDBTransaction *tr) {
 	fdb_transaction_clear_range(tr, (uint8_t*)"", 0, (uint8_t*)"\xff", 1);

@ -265,8 +214,8 @@ int interleavedSetsGets(FDBTransaction *tr, struct ResultSet *rs) {
 	return 10000 / (end - start);
 }

-struct ResultSet* runTests(struct ResultSet *rs) {
-	FDBDatabase *db = openDatabase(rs);
+void runTests(struct ResultSet *rs) {
+	FDBDatabase *db = openDatabase(rs, &netThread);

 	FDBTransaction *tr;
 	checkError(fdb_database_create_transaction(db, &tr), "create transaction", rs);
@ -289,18 +238,20 @@ struct ResultSet* runTests(struct ResultSet *rs) {

 	fdb_database_destroy(db);
 	fdb_stop_network();
-
-	return rs;
 }

 int main(int argc, char **argv) {
+	srand(time(NULL));
 	struct ResultSet *rs = newResultSet();
 	checkError(fdb_select_api_version(500), "select API version", rs);
 	printf("Running RYW Benchmark test at client version: %s\n", fdb_get_client_version());

-	populateKeys();
+	keys = generateKeys(numKeys, keySize);
 	runTests(rs);
 	writeResultSet(rs);
 	freeResultSet(rs);
+	freeKeys(keys, numKeys);
+
+	return 0;
 }

--- a/bindings/c/test/test.h
+++ b/bindings/c/test/test.h
@ -24,6 +24,14 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <pthread.h>
+
+#ifndef FDB_API_VERSION
+#define FDB_API_VERSION 500
+#endif
+
+#include <foundationdb/fdb_c.h>
+#include <foundationdb/fdb_c_options.g.h>

 double getTime() {
 	static struct timeval tv;
@ -31,6 +39,29 @@ double getTime() {
 	return tv.tv_usec/1000000.0 + tv.tv_sec;
 }

+/*
+ * Allocates *dest and fills it with `key` formatted as a zero-padded decimal
+ * of width keySize, followed by a terminating NUL.
+ *
+ * The buffer is keySize + 1 bytes so that the terminator written by the
+ * formatter stays inside the allocation; output longer than keySize digits
+ * is truncated. Exits the process if the allocation fails. The caller owns
+ * *dest and releases it with free().
+ */
+void writeKey(uint8_t **dest, int key, int keySize) {
+	size_t bufferSize = (size_t)keySize + 1;
+	*dest = (uint8_t*)malloc((sizeof(uint8_t))*bufferSize);
+	if(*dest == NULL) {
+		fprintf(stderr, "Out of memory allocating key\n");
+		exit(1);
+	}
+	snprintf((char*)*dest, bufferSize, "%0*d", keySize, key);
+}
+
+/*
+ * Builds an array of numKeys + 1 keys, where entry i is produced by
+ * writeKey(..., i, keySize). Returns the newly allocated array.
+ */
+uint8_t **generateKeys(int numKeys, int keySize) {
+	uint8_t **table = (uint8_t**)malloc(sizeof(uint8_t*)*(numKeys+1));
+
+	uint32_t idx = 0;
+	while(idx <= numKeys) {
+		writeKey(&table[idx], idx, keySize);
+		++idx;
+	}
+
+	return table;
+}
+/*
+ * Releases a key array built by generateKeys(numKeys, ...).
+ *
+ * generateKeys() allocates numKeys + 1 entries (indices 0..numKeys
+ * inclusive), so all of them are freed here before the array itself.
+ */
+void freeKeys(uint8_t **keys, int numKeys) {
+	int i;
+	for(i = 0; i <= numKeys; i++) {
+		free(keys[i]);
+	}
+	free(keys);
+}
+
 int cmpfunc(const void* a, const void* b) {
 	return (*(int*)a - *(int*)b);
 }
@ -40,6 +71,12 @@ int median(int *values, int length) {
 	return values[length/2];
 }

+// Outcome of one benchmark or transaction body: `res` is the value it
+// produced and `e` is an FDB error code, zero meaning no error.
+struct RunResult {
+	int res;
+	fdb_error_t e;
+};
+// Shorthand for a RunResult compound literal: RES(res, e).
+#define RES(x, y) (struct RunResult) { x, y }
+
 struct Kpi {
 	const char *name;
 	int value;
@ -86,7 +123,6 @@ void addError(struct ResultSet *rs, const char *message) {
 }

 void writeResultSet(struct ResultSet *rs) {
-	srand(time(NULL)); // TODO: move this?
 	uint64_t id = ((uint64_t)rand() << 32) + rand();
 	char name[100];
 	sprintf(name, "fdb-c_result-%llu.json", id);
@ -147,10 +183,10 @@ void freeResultSet(struct ResultSet *rs) {
 	free(rs);
 }

-int getError(int err, const char* context, struct ResultSet *rs) {
+fdb_error_t getError(fdb_error_t err, const char* context, struct ResultSet *rs) {
 	if(err) {
 		char *msg = (char*)malloc(strlen(context) + 100);
-		sprintf(msg, "Error in %s: %d", context, err);
+		sprintf(msg, "Error in %s: %s", context, fdb_get_error(err));
 		fprintf(stderr, "%s\n", msg);
 		if(rs != NULL) {
 			addError(rs, msg);
@ -162,7 +198,7 @@ int getError(int err, const char* context, struct ResultSet *rs) {
 	return err;
 }

-void checkError(int err, const char* context, struct ResultSet *rs) {
+void checkError(fdb_error_t err, const char* context, struct ResultSet *rs) {
 	if(getError(err, context, rs)) {
 		if(rs != NULL) {
 			writeResultSet(rs);
@ -172,3 +208,50 @@ void checkError(int err, const char* context, struct ResultSet *rs) {
 	}
 }

+/*
+ * Prints "Error in <context>: <description of err>" to stderr and, when rs
+ * is non-NULL, records the same message in the result set.
+ *
+ * The message buffer is sized from the formatted length, so neither a long
+ * context nor a long error description can overflow it. If the buffer cannot
+ * be built, the message is still printed but not recorded.
+ *
+ * Returns err unchanged so the call can be used inline.
+ */
+fdb_error_t logError(fdb_error_t err, const char* context, struct ResultSet *rs) {
+	const char *description = fdb_get_error(err);
+
+	// First pass measures the message; the buffer is sized from that.
+	int needed = snprintf(NULL, 0, "Error in %s: %s", context, description);
+	char *msg = needed < 0 ? NULL : (char*)malloc((size_t)needed + 1);
+	if(msg == NULL) {
+		fprintf(stderr, "Error in %s: %s\n", context, description);
+		return err;
+	}
+
+	snprintf(msg, (size_t)needed + 1, "Error in %s: %s", context, description);
+	fprintf(stderr, "%s\n", msg);
+	if(rs != NULL) {
+		addError(rs, msg);
+	}
+
+	free(msg);
+	return err;
+}
+
+/*
+ * Logs err through logError() only when it is non-zero and not classified
+ * as retryable by fdb_error_predicate(). Always returns err.
+ */
+fdb_error_t maybeLogError(fdb_error_t err, const char* context, struct ResultSet *rs) {
+	if(!err) {
+		return err;
+	}
+	if(fdb_error_predicate( FDB_ERROR_PREDICATE_RETRYABLE, err )) {
+		return err;
+	}
+	return logError(err, context, rs);
+}
+
+/*
+ * Thread start routine for the FDB network loop: runs fdb_run_network() and
+ * passes its result to checkError(). The argument is unused; the signature
+ * matches the void *(*)(void *) type that pthread_create() expects.
+ * Always returns NULL.
+ */
+void* runNetwork(void *unused) {
+	(void)unused;
+	checkError(fdb_run_network(), "run network", NULL);
+	return NULL;
+}
+
+/*
+ * Sets up the FDB network, starts the network loop on a new thread whose
+ * handle is stored in *netThread, and opens the database named "DB" on the
+ * default cluster.
+ *
+ * Any FDB failure is fatal via checkError(). A failure to start the network
+ * thread is also fatal: it is reported on stderr and in rs (when non-NULL)
+ * and the process exits, because no future could ever become ready without
+ * the network loop running.
+ *
+ * Returns the database handle; the caller releases it with
+ * fdb_database_destroy().
+ */
+FDBDatabase* openDatabase(struct ResultSet *rs, pthread_t *netThread) {
+	checkError(fdb_setup_network(), "setup network", rs);
+
+	int threadErr = pthread_create(netThread, NULL, &runNetwork, NULL);
+	if(threadErr != 0) {
+		// pthread_create reports an errno-style code, not an fdb_error_t,
+		// so it is formatted with strerror() instead of checkError().
+		char msg[160];
+		snprintf(msg, sizeof(msg), "Error in create network thread: %s", strerror(threadErr));
+		fprintf(stderr, "%s\n", msg);
+		if(rs != NULL) {
+			addError(rs, msg);
+			writeResultSet(rs);
+		}
+		exit(1);
+	}
+
+	FDBFuture *f = fdb_create_cluster(NULL);
+	checkError(fdb_future_block_until_ready(f), "block for cluster", rs);
+
+	FDBCluster *cluster;
+	checkError(fdb_future_get_cluster(f, &cluster), "get cluster", rs);
+
+	fdb_future_destroy(f);
+
+	f = fdb_cluster_create_database(cluster, (uint8_t*)"DB", 2);
+	checkError(fdb_future_block_until_ready(f), "block for database", rs);
+
+	FDBDatabase *db;
+	checkError(fdb_future_get_database(f, &db), "get database", rs);
+
+	fdb_future_destroy(f);
+	fdb_cluster_destroy(cluster);
+
+	return db;
+}
--- a/bindings/flow/fdb_flow.actor.cpp
+++ b/bindings/flow/fdb_flow.actor.cpp
@ -41,7 +41,7 @@ ACTOR Future<Void> _test() {
 	// tr->setVersion(1);

 	Version ver = wait( tr->getReadVersion() );
-	printf("%ld\n", ver);
+	printf("%lld\n", ver);

 	state std::vector< Future<Version> > versions;

--- a/bindings/flow/local.mk
+++ b/bindings/flow/local.mk
@ -29,7 +29,7 @@ packages/fdb-flow-$(FLOWVER)-$(PLATFORM)-$(ARCH).tar.gz: fdb_flow
 	@rm -rf packages/fdb-flow-$(FLOWVER)-$(PLATFORM)-$(ARCH)
 	@mkdir -p packages/fdb-flow-$(FLOWVER)-$(PLATFORM)-$(ARCH)/lib packages/fdb-flow-$(FLOWVER)-$(PLATFORM)-$(ARCH)/include/bindings/flow packages/fdb-flow-$(FLOWVER)-$(PLATFORM)-$(ARCH)/include/bindings/c/foundationdb
 	@cp lib/libfdb_flow.a packages/fdb-flow-$(FLOWVER)-$(PLATFORM)-$(ARCH)/lib
-	@find bindings/flow -name '*.h' -not -name 'bindings/flow/tester/*' -exec cp {} packages/fdb-flow-$(FLOWVER)-$(PLATFORM)-$(ARCH)/include/bindings/flow \;
+	@find bindings/flow -name '*.h' -not -path 'bindings/flow/tester/*' -exec cp {} packages/fdb-flow-$(FLOWVER)-$(PLATFORM)-$(ARCH)/include/bindings/flow \;
 	@find bindings/c/foundationdb -name '*.h' -exec cp {} packages/fdb-flow-$(FLOWVER)-$(PLATFORM)-$(ARCH)/include/bindings/c/foundationdb \;
 	@tar czf packages/fdb-flow-$(FLOWVER)-$(PLATFORM)-$(ARCH).tar.gz -C packages fdb-flow-$(FLOWVER)-$(PLATFORM)-$(ARCH)
 	@rm -rf packages/fdb-flow-$(FLOWVER)-$(PLATFORM)-$(ARCH)
--- a/bindings/java/fdbJNI.cpp
+++ b/bindings/java/fdbJNI.cpp
@ -183,8 +183,10 @@ JNIEXPORT void JNICALL Java_com_apple_cie_foundationdb_NativeFuture_Future_1regi
 	// Here we cache a thread-local reference to jenv
 	g_thread_jenv = jenv;
 	fdb_error_t err = fdb_future_set_callback( f, &callCallback, callback );
-	if( err )
+	if( err ) {
+		jenv->DeleteGlobalRef( callback );
 		safeThrow( jenv, getThrowable( jenv, err ) );
+	}
 }

 JNIEXPORT void JNICALL Java_com_apple_cie_foundationdb_NativeFuture_Future_1blockUntilReady(JNIEnv *jenv, jobject, jlong future) {
@ -335,15 +337,7 @@ JNIEXPORT jobject JNICALL Java_com_apple_cie_foundationdb_FutureResults_FutureRe
 			return JNI_NULL;
 		}

-		uint8_t *keyvalues_barr = (uint8_t *)jenv->GetByteArrayElements(lastKey, NULL);
-		if (!keyvalues_barr) {
-			throwRuntimeEx( jenv, "Error getting handle to native resources" );
-			return JNI_NULL;
-		}
-
-		memcpy(keyvalues_barr, kvs[count - 1].key, kvs[count - 1].key_length);
-		// void function that is not documented as not throwing
-		jenv->ReleaseByteArrayElements(lastKey, (jbyte *)keyvalues_barr, 0);
+		jenv->SetByteArrayRegion(lastKey, 0, kvs[count - 1].key_length, (jbyte *)kvs[count - 1].key);
 	}

 	jobject result = jenv->NewObject(resultCls, resultCtorId, lastKey, count, (jboolean)more);
@ -353,6 +347,7 @@ JNIEXPORT jobject JNICALL Java_com_apple_cie_foundationdb_FutureResults_FutureRe
 	return result;
 }

+// SOMEDAY: explore doing this more efficiently with Direct ByteBuffers
 JNIEXPORT jobject JNICALL Java_com_apple_cie_foundationdb_FutureResults_FutureResults_1get(JNIEnv *jenv, jobject, jlong future) {
 	if( !future ) {
 		throwParamNotNull(jenv);
@ -384,12 +379,6 @@ JNIEXPORT jobject JNICALL Java_com_apple_cie_foundationdb_FutureResults_FutureRe
 			throwOutOfMem(jenv);
 		return JNI_NULL;
 	}
-	uint8_t *keyvalues_barr = (uint8_t *)jenv->GetByteArrayElements(keyValueArray, NULL); 
-	if (!keyvalues_barr) {
-		throwRuntimeEx( jenv, "Error getting handle to native resources" );
-		return JNI_NULL;
-	}
-
 	jintArray lengthArray = jenv->NewIntArray(count * 2);
 	if( !lengthArray ) {
 		if( !jenv->ExceptionOccurred() )
@ -406,16 +395,15 @@ JNIEXPORT jobject JNICALL Java_com_apple_cie_foundationdb_FutureResults_FutureRe

 	int offset = 0;
 	for(int i = 0; i < count; i++) {
-		memcpy(keyvalues_barr + offset, kvs[i].key, kvs[i].key_length);
+		jenv->SetByteArrayRegion(keyValueArray, offset, kvs[i].key_length, (jbyte *)kvs[i].key);
 		length_barr[ i * 2 ] = kvs[i].key_length;
 		offset += kvs[i].key_length;

-		memcpy(keyvalues_barr + offset, kvs[i].value, kvs[i].value_length);
+		jenv->SetByteArrayRegion(keyValueArray, offset, kvs[i].value_length, (jbyte *)kvs[i].value);
 		length_barr[ (i * 2) + 1 ] = kvs[i].value_length;
 		offset += kvs[i].value_length;
 	}

-	jenv->ReleaseByteArrayElements(keyValueArray, (jbyte *)keyvalues_barr, 0);
 	jenv->ReleaseIntArrayElements(lengthArray, length_barr, 0);

 	jobject result = jenv->NewObject(resultCls, resultCtorId, keyValueArray, lengthArray, (jboolean)more);
@ -425,7 +413,7 @@ JNIEXPORT jobject JNICALL Java_com_apple_cie_foundationdb_FutureResults_FutureRe
 	return result;
 }

-// SOMEDAY: this could be done much more efficiently with Direct ByteBuffers
+// SOMEDAY: explore doing this more efficiently with Direct ByteBuffers
 JNIEXPORT jbyteArray JNICALL Java_com_apple_cie_foundationdb_FutureResult_FutureResult_1get(JNIEnv *jenv, jobject, jlong future) {
 	if( !future ) {
 		throwParamNotNull(jenv);
@ -451,15 +439,8 @@ JNIEXPORT jbyteArray JNICALL Java_com_apple_cie_foundationdb_FutureResult_Future
 			throwOutOfMem(jenv);
 		return JNI_NULL;
 	}
-	uint8_t *barr = (uint8_t *)jenv->GetByteArrayElements(result, NULL); 
-	if (!barr) {
-		throwRuntimeEx( jenv, "Error getting handle to native resources" );
-		return JNI_NULL;
-	}

-	memcpy(barr, value, length);
-	// passing "0" here commits the data back and releases the native copy
-	jenv->ReleaseByteArrayElements(result, (jbyte *)barr, 0);
+	jenv->SetByteArrayRegion(result, 0, length, (const jbyte *)value);
 	return result;
 }

@ -484,15 +465,8 @@ JNIEXPORT jbyteArray JNICALL Java_com_apple_cie_foundationdb_FutureKey_FutureKey
 			throwOutOfMem(jenv);
 		return JNI_NULL;
 	}
-	uint8_t *barr = (uint8_t *)jenv->GetByteArrayElements(result, NULL); 
-	if (!barr) {
-		throwRuntimeEx( jenv, "Error getting handle to native resources" );
-		return JNI_NULL;
-	}

-	memcpy(barr, value, length);
-	// passing "0" here commits the data back and releases the native copy
-	jenv->ReleaseByteArrayElements(result, (jbyte *)barr, 0);
+	jenv->SetByteArrayRegion(result, 0, length, (const jbyte *)value);
 	return result;
 }

@ -922,6 +896,8 @@ JNIEXPORT jlong JNICALL Java_com_apple_cie_foundationdb_FDBTransaction_Transacti
 	int size = jenv->GetArrayLength( key );

 	FDBFuture *f = fdb_transaction_get_addresses_for_key( tr, barr, size );
+
+	jenv->ReleaseByteArrayElements( key, (jbyte *)barr, JNI_ABORT );
 	return (jlong)f;
 }

@ -966,6 +942,8 @@ JNIEXPORT jlong JNICALL Java_com_apple_cie_foundationdb_FDBTransaction_Transacti
 	}
 	int size = jenv->GetArrayLength( key );
 	FDBFuture *f = fdb_transaction_watch( tr, barr, size );
+
+	jenv->ReleaseByteArrayElements( key, (jbyte *)barr, JNI_ABORT );
 	return (jlong)f;
 }

--- a/bindings/java/src-completable/test/com/apple/cie/foundationdb/test/AsListTest.java
+++ b/bindings/java/src-completable/test/com/apple/cie/foundationdb/test/AsListTest.java
@ -1,88 +0,0 @@
-/*
- * AsListTest.java
- *
- * This source file is part of the FoundationDB open source project
- *
- * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors
- * 
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- * 
- *     http://www.apache.org/licenses/LICENSE-2.0
- * 
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.apple.cie.foundationdb.test;
-
-import com.apple.cie.foundationdb.Database;
-import com.apple.cie.foundationdb.FDB;
-import com.apple.cie.foundationdb.LocalityUtil;
-import com.apple.cie.foundationdb.Transaction;
-import com.apple.cie.foundationdb.async.AsyncUtil;
-
-import java.util.function.Function;
-import java.util.concurrent.CompletableFuture;
-
-public class AsListTest {
-	/**
-	 * When the database contains keys a, b, c, d, e -- this should return 5 items,
-	 * a bug made the addition of the clear into the result returning 0 items.
-	 */
-	public static void main(String[] args) {
-		FDB fdb = FDB.selectAPIVersion(500);
-		Database database = fdb.open("T:\\circus\\tags\\RebarCluster-bbc\\cluster_id.txt");
-		database.options().setLocationCacheSize(42);
-		Transaction tr = database.createTransaction();
-		//tr.clear("g".getBytes());
-		/*tr.clear("bbb".getBytes());
-		AsyncIterable<KeyValue> query = tr.getRange(
-				KeySelector.firstGreaterOrEqual("a".getBytes()),
-				KeySelector.firstGreaterOrEqual("e".getBytes()),
-				Integer.MAX_VALUE);
-		//List<KeyValue> list = query.asList().get();
-		//System.out.println("List size: " + list.size());
-*/
-		String[] keyAddresses = LocalityUtil.getAddressesForKey(tr, "a".getBytes()).join();
-		for(String s : keyAddresses) {
-			System.out.println(" @ " + s);
-		}
-
-		@SuppressWarnings("unused")
-		CompletableFuture<Integer> i = AsyncUtil.applySafely(new Function<Exception, CompletableFuture<Integer>>() {
-			@Override
-			public CompletableFuture<Integer> apply(Exception o) {
-				return CompletableFuture.completedFuture(3);
-			}
-		}, new RuntimeException());
-
-		CompletableFuture<Integer> f = null;
-
-		@SuppressWarnings({ "unused", "null" })
-		CompletableFuture<String> g = f.thenComposeAsync(new Function<Integer, CompletableFuture<String>>() {
-			@Override
-			public CompletableFuture<String> apply(Integer o) {
-				return CompletableFuture.completedFuture(o.toString());
-			}
-		});
-
-		@SuppressWarnings({ "unused", "null" })
-		CompletableFuture<String> g2 = f.thenComposeAsync(new Function<Integer, CompletableFuture<String>>() {
-			@Override
-			public CompletableFuture<String> apply(Integer o) {
-				return CompletableFuture.completedFuture(o.toString());
-			}
-		}).exceptionally(new Function<Throwable, String>() {
-			@Override
-			public String apply(Throwable o) {
-				// TODO Auto-generated method stub
-				return null;
-			}
-		});
-	}
-}
--- a/bindings/java/src-completable/test/com/apple/cie/foundationdb/test/PerformanceTester.java
+++ b/bindings/java/src-completable/test/com/apple/cie/foundationdb/test/PerformanceTester.java
@ -352,7 +352,7 @@ public class PerformanceTester extends AbstractTester {
            long start = System.nanoTime();
            for (int i = 0; i < count; i++) {
                int keyIndex = randomKeyIndex();
-                tr.getRange(key(keyIndex), key(keyIndex + 1)).asList().join();
+                tr.getRange(key(keyIndex), key(keyIndex + 1), 2).asList().join();
            }
            long end = System.nanoTime();

--- a/bindings/java/src-completable/test/com/apple/cie/foundationdb/test/TestApp.java
+++ b/bindings/java/src-completable/test/com/apple/cie/foundationdb/test/TestApp.java
@ -1,91 +0,0 @@
-/*
- * TestApp.java
- *
- * This source file is part of the FoundationDB open source project
- *
- * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors
- * 
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- * 
- *     http://www.apache.org/licenses/LICENSE-2.0
- * 
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.apple.cie.foundationdb.test;
-
-import java.util.concurrent.CompletableFuture;
-
-import com.apple.cie.foundationdb.Cluster;
-import com.apple.cie.foundationdb.Database;
-import com.apple.cie.foundationdb.FDB;
-import com.apple.cie.foundationdb.Transaction;
-
-public class TestApp {
-
-	public static void main(String[] args) throws Exception {
-		try {
-			Cluster cluster = FDB.selectAPIVersion(500).createCluster();
-			System.out.println("I now have the cluster");
-			Database db = cluster.openDatabase();
-
-			Transaction tr = db.createTransaction();
-			System.out.println("TR: " + tr);
-
-			byte[] appleValue = tr.get("apple".getBytes()).get();
-			System.out.println("Apple: " + (appleValue == null ? null : new String(appleValue)));
-
-			tr.set("apple".getBytes(), "crunchy".getBytes());
-			System.out.println("Attempting to commit apple/crunchy...");
-			tr.commit().get(); // FIXME: this is not an ok use of the API
-			tr = db.createTransaction();
-
-			long topTime = 0, getTime = 0, bottomTime = 0;
-
-			for(int i = 0; i < 1000; i++) {
-				long a = System.currentTimeMillis();
-
-				final byte[] key = ("apple" + i).getBytes();
-				tr = db.createTransaction();
-				CompletableFuture<byte[]> future = tr.get(key);
-
-				long b = System.currentTimeMillis();
-
-				future.get();
-
-				long c = System.currentTimeMillis();
-
-				tr.set(key, ("Apple" + i).getBytes());
-				final CompletableFuture<Void> commit = tr.commit();
-
-				long d = System.currentTimeMillis();
-
-				commit.whenCompleteAsync((v, error) -> {
-					if(error != null) {
-						error.printStackTrace();
-					}
-				});
-
-				topTime += b - a;
-				getTime += c - b;
-				bottomTime += d - c;
-			}
-
-			System.out.println(" Top:    " + topTime);
-			System.out.println(" Get:    " + getTime);
-			System.out.println(" Bottom: " + bottomTime);
-
-			tr.dispose();
-			db.dispose();
-			cluster.dispose();
-		} catch(Throwable t) {
-			t.printStackTrace();
-		}
-	}
-}
--- a/bindings/nodejs/include.mk
+++ b/bindings/nodejs/include.mk
@ -46,7 +46,7 @@ bindings/nodejs/fdb_node.stamp: bindings/nodejs/src/FdbOptions.g.cpp bindings/no
 	for ver in $(NODE_VERSIONS); do \
 		MMVER=`echo $$ver | sed -e 's,\., ,g' | awk '{print $$1 "." $$2}'` && \
 		mkdir modules/$$MMVER && \
-		node-gyp configure --target=$$ver && \
+		node-gyp configure --dist-url=https://nodejs.org/dist --target=$$ver && \
 		node-gyp -v build && \
 		cp build/Release/fdblib.node modules/$${MMVER} ; \
 	done
@ -67,6 +67,7 @@ bindings/nodejs/package.json: bindings/nodejs/package.json.in $(ALL_MAKEFILES) v
 	@m4 -DVERSION=$(NPMVER) $< > $@
 	@echo "Updating       Node dependencies"
 	@cd bindings/nodejs && \
+	npm config set registry "https://registry.npmjs.org/" && \
 	npm update

 fdb_node_npm: fdb_node versions.target bindings/nodejs/README.md bindings/nodejs/lib/*.js bindings/nodejs/src/* bindings/nodejs/binding.gyp LICENSE
--- a/design/tuple.md
+++ b/design/tuple.md
@ -0,0 +1,196 @@
+# FDB Tuple layer typecodes
+
+This document is intended to be the system of record for the allocation of typecodes in the Tuple layer. The source code isn’t good enough because a typecode might be added to one language (or by a customer) before another.
+
+Status: Standard means that all of our language bindings implement this typecode
+Status: Reserved means that this typecode is not yet used in our standard language bindings, but may be in use by third party bindings or specific applications
+Status: Deprecated means that a previous layer used this type, but issues with that type code have led us to mark this type code as not to be used.
+
+
+### **Null Value**
+
+Typecode: `0x00`
+Length: 0 bytes  
+Status: Standard
+
+### **Byte String**
+
+Typecode: `0x01`
+Length: Variable (terminated by `[\x00]![\xff]`)  
+Encoding: `b'\x01' + value.replace(b'\x00', b'\x00\xFF') + b'\x00'`  
+Test case: `pack("foo\x00bar") == b'\x01foo\x00\xffbar\x00'`  
+Status: Standard
+
+In other words, byte strings are null terminated with null values occurring in the string escaped in an order-preserving way.
+
+### **Unicode String**
+
+Typecode: `0x02`
+Length: Variable (terminated by `[\x00]![\xff]`)  
+Encoding: `b'\x02' + value.encode('utf-8').replace(b'\x00', b'\x00\xFF') + b'\x00'`  
+Test case: `pack( u"F\u00d4O\u0000bar" ) == b'\x02F\xc3\x94O\x00\xffbar\x00'`  
+Status: Standard
+
+This is the same way that byte strings are encoded, but first, the unicode string is encoded in UTF-8.
+
+### **(DEPRECATED) Nested Tuple**
+
+Typecodes: `0x03` - `0x04`
+Length: Variable (terminated by `0x04` type code)  
+Status: Deprecated  
+
+This encoding was used by a few layers. However, it had ordering problems when one tuple was a prefix of another and the type of the first element in the longer tuple was either null or a byte string. For an example, consider the empty tuple and the tuple containing only null. In the old scheme, the empty tuple would be encoded as `\x03\x04` while the tuple containing only null would be encoded as `\x03\x00\x04`, so the second tuple would sort first based on their bytes, which is incorrect semantically.
+
+### **Nested Tuple**
+
+Typecodes: `0x05`
+Length: Variable (terminated by `[\x00]![\xff]` at beginning of nested element)  
+Encoding: `b'\x05' + ''.join(map(lambda x: b'\x00\xff' if x is None else pack(x), value)) + b'\x00'`  
+Test case: `pack( ("foo\x00bar", None, ()) ) == b'\x05\x01foo\x00\xffbar\x00\x00\xff\x05\x00\x00'`  
+Status: Standard
+
+The list is ended with a 0x00 byte. Nulls within the tuple are encoded as `\x00\xff`. There is no other null escaping. In particular, 0x00 bytes that are within the nested types can be left as-is as they are passed over when decoding the interior types. To show how this fixes the bug in the previous version of nested tuples, the empty tuple is now encoded as `\x05\x00` while the tuple containing only null is encoded as `\x05\x00\xff\x00`, so the first tuple will sort first.
+
+### **Negative arbitrary-precision Integer**
+
+Typecodes: `0x0a`, `0x0b`
+Encoding: Not defined yet  
+Status: Reserved; `0x0b` used in Python and Java
+
+These typecodes are reserved for encoding integers larger than 8 bytes. Presumably the type code would be followed by some encoding of the length, followed by the big endian one’s complement number. Reserving two typecodes for each of positive and negative numbers is probably overkill, but until there’s a design in place we might as well not use them. In the Python and Java implementations, `0x0b` stores negative numbers which are expressed with between 9 and 255 bytes. The first byte following the type code (`0x0b`) is a single byte expressing the number of bytes in the integer (with its bits flipped to preserve order), followed by that number of bytes representing the number in big endian order in one's complement.
+
+### **Integer**
+
+Typecodes: `0x0c` - `0x1c`
+&nbsp;`0x0c` is an 8 byte negative number  
+&nbsp;`0x13` is a 1 byte negative number  
+&nbsp;`0x14` is a zero  
+&nbsp;`0x15` is a 1 byte positive number  
+&nbsp;`0x1c` is an 8 byte positive number  
+Length: Depends on typecode (0-8 bytes)  
+Encoding: positive numbers are big endian  
+ negative numbers are big endian one’s complement (so -1 is `0x13` `0xfe`)  
+Test case: `pack( -5551212 ) == b'\x11\xabK\x93'`  
+Status: Standard
+
+There is some variation in the ability of language bindings to encode and decode values at the outside of the possible range, because of different native representations of integers. 
+
+### **Positive arbitrary-precision Integer**
+
+Typecodes: `0x1d`, `0x1e`
+Encoding: Not defined yet  
+Status: Reserved; `0x1d` used in Python and Java
+
+These typecodes are reserved for encoding integers larger than 8 bytes. Presumably the type code would be followed by some encoding of the length, followed by the big endian one’s complement number. Reserving two typecodes for each of positive and negative numbers is probably overkill, but until there’s a design in place we might as well not use them. In the Python and Java implementations, `0x1d` stores positive numbers which are expressed with between 9 and 255 bytes. The first byte following the type code (`0x1d`) is a single byte expressing the number of bytes in the integer, followed by that number of bytes representing the number in big endian order.
+
+### **IEEE Binary Floating Point**
+
+Typecodes:   
+&nbsp;`0x20` - float (32 bits)  
+&nbsp;`0x21` - double (64 bits)  
+&nbsp;`0x22` - long double (80 bits)  
+Length: 4 - 10 bytes  
+Test case: `pack( -42f ) == b'\x20=\xd7\xff\xff'` (the leading `\x20` is the float typecode)  
+Encoding: Big-endian IEEE binary representation, followed by the following transformation:  
+```python
+ if ord(rep[0])&0x80: # Check sign bit
+    # Flip all bits, this is easier in most other languages!
+    return "".join( chr(0xff^ord(r)) for r in rep )
+ else:
+    # Flip just the sign bit
+    return chr(0x80^ord(rep[0])) + rep[1:]
+```
+Status: Standard (float and double) ; Reserved (long double)
+
+The binary representation should not be assumed to be canonicalized (as to multiple representations of NaN, for example) by a reader. This order sorts all numbers in the following way:
+
+* All negative NaN values with order determined by mantissa bits (which are semantically meaningless)
+* Negative infinity
+* All real numbers in the standard order (except that -0.0 < 0.0)
+* Positive infinity
+* All positive NaN values with order determined by mantissa bits
+
+This should be equivalent to the standard IEEE total ordering.
+
+### **Arbitrary-precision Decimal**
+
+Typecodes: `0x23`, `0x24`
+Length: Arbitrary  
+Encoding: Scale followed by arbitrary precision integer  
+Status: Reserved
+
+This encoding format has been used by layers. Note that this encoding makes almost no guarantees about ordering properties of tuple-encoded values and should thus generally be avoided.
+
+### **(DEPRECATED) True Value**
+
+Typecode: `0x25`
+Length: 0 bytes  
+Status: Deprecated
+
+### **False Value**
+
+Typecode: `0x26`
+Length: 0 bytes  
+Status: Standard
+
+### **True Value**
+
+Typecode: `0x27`
+Length: 0 bytes  
+Status: Standard
+
+Note that false will sort before true with the given encoding.
+
+### **RFC 4122 UUID**
+
+Typecode: `0x30`
+Length: 16 bytes  
+Encoding: Network byte order as defined in the rfc: [_http://www.ietf.org/rfc/rfc4122.txt_](http://www.ietf.org/rfc/rfc4122.txt)  
+Status: Standard
+
+This is equivalent to the unsigned byte ordering of the UUID bytes in big-endian order.
+
+### **64 bit identifier**
+
+Typecode: `0x31`
+Length: 8 bytes  
+Encoding: Big endian unsigned 8-byte integer (typically random or perhaps semi-sequential)  
+Status: Reserved
+
+There’s definitely some question of whether this deserves to be separated from a plain old 64 bit integer, but a separate type was desired in one of the third-party bindings. This type has not been ported over to the first-party bindings.
+
+### **80 Bit Versionstamp**
+
+Typecode: `0x32`
+Length: 10 bytes  
+Encoding: Big endian 10-byte integer. First/high 8 bytes are a database version, next two are batch version.  
+Status: Reserved
+
+### **96 Bit Versionstamp**
+
+Typecode: `0x33`
+Length: 12 bytes  
+Encoding: Big endian 12-byte integer. First/high 8 bytes are a database version, next two are batch version, next two are ordering within transaction.  
+Status: Reserved
+
+The two versionstamp typecodes are reserved for future work adding compatibility between the tuple layer and versionstamp operations. Note that the first 80 bits of the 96 bit versionstamp are the same as the contents of the 80 bit versionstamp, and they correspond to what the `SET_VERSIONSTAMP_KEY` mutation will write into a database key, i.e., the first 8 bytes are a big-endian, unsigned version corresponding to the commit version of a transaction, and the next two bytes are a big-endian, unsigned batch number ordering transactions that are committed at the same version. The final two bytes of the 96 bit versionstamp are written by the client and should order writes within a single transaction, thereby providing a global order for all versions.
+
+### **User type codes**
+
+Typecode: `0x40` - `0x4f`
+Length: Variable (user defined)  
+Encoding: User defined  
+Status: Reserved
+
+These type codes may be used by third party extenders without coordinating with us. If used in shipping software, the software should use the directory layer and specify a specific layer name when opening its directories to eliminate the possibility of conflicts.
+
+The only way in which future official, otherwise backward-compatible versions of the tuple layer would be expected to use these type codes is to implement some kind of actual extensibility point for this purpose - they will not be used for standard types.
+
+### **Escape Character**
+
+Typecode: `0xff`
+Length: N/A  
+Encoding: N/A  
+Status: Reserved
+
+This type code is not used for anything. However, several of the other tuple types depend on this type code not being used as a type code for other types in order to correctly escape bytes in an order-preserving way. Therefore, it would be a Very Bad Idea™ for future development to start using this code for anything else.
--- a/fdbbackup/backup.actor.cpp
+++ b/fdbbackup/backup.actor.cpp
@ -23,6 +23,7 @@
 #include "flow/serialize.h"
 #include "flow/IRandom.h"
 #include "flow/genericactors.actor.h"
+#include "flow/SignalSafeUnwind.h"

 #include "fdbclient/FDBTypes.h"
 #include "fdbclient/BackupAgent.h"
@ -98,7 +99,7 @@ enum {
 	OPT_CLUSTERFILE, OPT_QUIET, OPT_DRYRUN, OPT_FORCE,
 	OPT_HELP, OPT_DEVHELP, OPT_VERSION, OPT_PARENTPID, OPT_CRASHONERROR,
 	OPT_NOBUFSTDOUT, OPT_BUFSTDOUTERR, OPT_TRACE, OPT_TRACE_DIR,
-	OPT_KNOB, OPT_TRACE_LOG_GROUP,
+	OPT_KNOB, OPT_TRACE_LOG_GROUP, OPT_LOCALITY,

 	//DB constants
 	OPT_SOURCE_CLUSTER,
@ -121,6 +122,7 @@ CSimpleOpt::SOption g_rgAgentOptions[] = {
 	{ OPT_TRACE,           "--log",            SO_NONE },
 	{ OPT_TRACE_DIR,       "--logdir",         SO_REQ_SEP },
 	{ OPT_CRASHONERROR,    "--crash",          SO_NONE },
+	{ OPT_LOCALITY,        "--locality_",      SO_REQ_SEP },
 	{ OPT_HELP,            "-?",               SO_NONE },
 	{ OPT_HELP,            "-h",               SO_NONE },
 	{ OPT_HELP,            "--help",           SO_NONE },
@ -315,6 +317,7 @@ CSimpleOpt::SOption g_rgDBAgentOptions[] = {
 	{ OPT_TRACE,           "--log",            SO_NONE },
 	{ OPT_TRACE_DIR,       "--logdir",         SO_REQ_SEP },
 	{ OPT_CRASHONERROR,    "--crash",          SO_NONE },
+	{ OPT_LOCALITY,        "--locality_",      SO_REQ_SEP },
 	{ OPT_HELP,            "-?",               SO_NONE },
 	{ OPT_HELP,            "-h",               SO_NONE },
 	{ OPT_HELP,            "--help",           SO_NONE },
@ -1820,6 +1823,7 @@ extern uint8_t *g_extra_memory;

 int main(int argc, char* argv[]) {
 	platformInit();
+	initSignalSafeUnwind();

 	int	status = FDB_EXIT_SUCCESS;

@ -1993,6 +1997,7 @@ int main(int argc, char* argv[]) {
 		std::string traceLogGroup;
 		ESOError	lastError;
 		bool partial = true;
+		LocalityData localities;

 		std::vector<std::string> blobArgs;

@ -2084,6 +2089,17 @@ int main(int argc, char* argv[]) {
 				case OPT_TRACE_LOG_GROUP:
 					traceLogGroup = args->OptionArg();
 					break;
+				case OPT_LOCALITY: {
+					std::string syn = args->OptionSyntax();
+					if (!StringRef(syn).startsWith(LiteralStringRef("--locality_"))) {
+						fprintf(stderr, "ERROR: unable to parse locality key '%s'\n", syn.c_str());
+						return FDB_EXIT_ERROR;
+					}
+					syn = syn.substr(11);
+					std::transform(syn.begin(), syn.end(), syn.begin(), ::tolower);
+					localities.set(Standalone<StringRef>(syn), Standalone<StringRef>(std::string(args->OptionArg())));
+					break;
+					}
 				case OPT_CLUSTERFILE:
 					clusterFile = args->OptionArg();
 					break;
@ -2363,7 +2379,7 @@ int main(int argc, char* argv[]) {
 				.detail("CommandLine", commandLine)
 				.trackLatest("ProgramStart");

-			db = cluster->createDatabase(databaseKey).get();
+			db = cluster->createDatabase(databaseKey, localities).get();
 			
 			if(sourceClusterFile.size()) {
 				auto resolvedSourceClusterFile = ClusterConnectionFile::lookupClusterFileName(sourceClusterFile);
@ -2384,7 +2400,7 @@ int main(int argc, char* argv[]) {
 					return 1;
 				}

-				source_db = source_cluster->createDatabase(databaseKey).get();
+				source_db = source_cluster->createDatabase(databaseKey, localities).get();
 			}
 		}

--- a/fdbcli/fdbcli.actor.cpp
+++ b/fdbcli/fdbcli.actor.cpp
@ -30,6 +30,7 @@
 #include "fdbclient/FDBOptions.g.h"

 #include "flow/DeterministicRandom.h"
+#include "flow/SignalSafeUnwind.h"
 #include "fdbrpc/TLSConnection.h"
 #include "fdbrpc/Platform.h"

@ -436,9 +437,9 @@ void initHelp() {
 		"clear a range of keys from the database",
 		"All keys between BEGINKEY (inclusive) and ENDKEY (exclusive) are cleared from the database. This command will succeed even if the specified range is empty, but may fail because of conflicts." ESCAPINGK);
 	helpMap["configure"] = CommandHelp(
-		"configure [new] <single|double|triple|three_data_hall|three_datacenter|ssd|memory|proxies=<PROXIES>|logs=<LOGS>|resolvers=<RESOLVERS>>*",
+		"configure [new] <single|double|triple|three_data_hall|three_datacenter|multi_dc|ssd|memory|proxies=<PROXIES>|logs=<LOGS>|resolvers=<RESOLVERS>>*",
 		"change database configuration",
-		"The `new' option, if present, initializes a new database with the given configuration rather than changing the configuration of an existing one. When used, both a redundancy mode and a storage engine must be specified.\n\nRedundancy mode:\n  single - one copy of the data.  Not fault tolerant.\n  double - two copies of data (survive one failure).\n  triple - three copies of data (survive two failures).\n  three_data_hall - See the Admin Guide.\n  three_datacenter - See the Admin Guide.\n\nStorage engine:\n  ssd - B-Tree storage engine optimized for solid state disks.\n  memory - Durable in-memory storage engine for small datasets.\n\nproxies=<PROXIES>: Sets the desired number of proxies in the cluster. Must be at least 1, or set to -1 which restores the number of proxies to the default value.\n\nlogs=<LOGS>: Sets the desired number of log servers in the cluster. Must be at least 1, or set to -1 which restores the number of logs to the default value.\n\nresolvers=<RESOLVERS>: Sets the desired number of resolvers in the cluster. Must be at least 1, or set to -1 which restores the number of resolvers to the default value.\n\nSee the FoundationDB Administration Guide for more information.");
+		"The `new' option, if present, initializes a new database with the given configuration rather than changing the configuration of an existing one. When used, both a redundancy mode and a storage engine must be specified.\n\nRedundancy mode:\n  single - one copy of the data.  Not fault tolerant.\n  double - two copies of data (survive one failure).\n  triple - three copies of data (survive two failures).\n  three_data_hall - See the Admin Guide.\n  three_datacenter - See the Admin Guide.\n  multi_dc - See the Admin Guide.\n\nStorage engine:\n  ssd - B-Tree storage engine optimized for solid state disks.\n  memory - Durable in-memory storage engine for small datasets.\n\nproxies=<PROXIES>: Sets the desired number of proxies in the cluster. Must be at least 1, or set to -1 which restores the number of proxies to the default value.\n\nlogs=<LOGS>: Sets the desired number of log servers in the cluster. Must be at least 1, or set to -1 which restores the number of logs to the default value.\n\nresolvers=<RESOLVERS>: Sets the desired number of resolvers in the cluster. Must be at least 1, or set to -1 which restores the number of resolvers to the default value.\n\nSee the FoundationDB Administration Guide for more information.");
 	helpMap["coordinators"] = CommandHelp(
 		"coordinators auto|<ADDRESS>+ [description=new_cluster_description]",
 		"change cluster coordinators or description",
@ -504,6 +505,7 @@ void initHelp() {
 		"If no addresses are specified, populates the list of processes which can be killed. Processes cannot be killed before this list has been populated.\n\nIf `all' is specified, attempts to kill all known processes.\n\nIf `list' is specified, displays all known processes. This is only useful when the database is unresponsive.\n\nFor each IP:port pair in <ADDRESS>*, attempt to kill the specified process.");

 	hiddenCommands.insert("expensive_data_check");
+	hiddenCommands.insert("datadistribution");
 }

 void printVersion() {
@ -1672,7 +1674,18 @@ ACTOR Future<bool> exclude( Database db, std::vector<StringRef> tokens, Referenc
 			state double worstFreeSpaceRatio = 1.0;
 			try {
 				for (auto proc : processesMap.obj()){
+					bool storageServer = false;
 					StatusArray rolesArray = proc.second.get_obj()["roles"].get_array();
+					for (StatusObjectReader role : rolesArray) {
+						if (role["role"].get_str() == "storage") {
+							storageServer = true;
+							break;
+						}
+					}
+					// Skip non-storage servers in free space calculation
+					if (!storageServer)
+						continue;
+
 					StatusObjectReader process(proc.second);
 					std::string addrStr;
 					if (!process.get("address", addrStr)) {
@ -1681,6 +1694,9 @@ ACTOR Future<bool> exclude( Database db, std::vector<StringRef> tokens, Referenc
 					}
 					NetworkAddress addr = NetworkAddress::parse(addrStr);
 					bool excluded = (process.has("excluded") && process.last().get_bool()) || addressExcluded(exclusions, addr);
+					ssTotalCount++;
+					if (excluded)
+						ssExcludedCount++;

 					if(!excluded) {
 						StatusObjectReader disk;
@ -1703,15 +1719,6 @@ ACTOR Future<bool> exclude( Database db, std::vector<StringRef> tokens, Referenc

 						worstFreeSpaceRatio = std::min(worstFreeSpaceRatio, double(free_bytes)/total_bytes);
 					}
-
-					for (StatusObjectReader role : rolesArray) {
-						if (role["role"].get_str() == "storage") {
-							if (excluded)
-								ssExcludedCount++;
-							ssTotalCount++;
-							break;
-						}
-					}
 				}
 			}
 			catch (...)  // std::exception
@ -1895,7 +1902,7 @@ void onoff_generator(const char* text, const char *line, std::vector<std::string
 }

 void configure_generator(const char* text, const char *line, std::vector<std::string>& lc) {
-	const char* opts[] = {"new", "single", "double", "triple", "three_data_hall", "three_datacenter", "ssd", "ssd-1", "ssd-2", "memory", "proxies=", "logs=", "resolvers=", NULL};
+	const char* opts[] = {"new", "single", "double", "triple", "three_data_hall", "three_datacenter", "multi_dc", "ssd", "ssd-1", "ssd-2", "memory", "proxies=", "logs=", "resolvers=", NULL};
 	array_generator(text, line, opts, lc);
 }

@ -2210,35 +2217,45 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise) {
 			state UID randomID = g_random->randomUniqueID();
 			TraceEvent(SevInfo, "CLICommandLog", randomID).detail("command", printable(StringRef(line)));

-			bool err, partial;
-			state std::vector<std::vector<StringRef>> parsed = parseLine(line, err, partial);
-			if (err) {
-				LogCommand(line, randomID, "ERROR: malformed escape sequence");
-				is_error = true;
-				continue;
-			}
-			if (partial) {
-				LogCommand(line, randomID, "ERROR: unterminated quote");
-				is_error = true;
-				continue;
+			bool malformed, partial;
+			state std::vector<std::vector<StringRef>> parsed = parseLine(line, malformed, partial);
+			if (malformed) LogCommand(line, randomID, "ERROR: malformed escape sequence");
+			if (partial) LogCommand(line, randomID, "ERROR: unterminated quote");
+			if (malformed || partial) {
+				if (parsed.size() > 0) {
+					// Denote via a special token that the command was a parse failure.
+					auto& last_command = parsed.back();
+					last_command.insert(last_command.begin(), StringRef((const uint8_t*)"parse_error", strlen("parse_error")));
+				}
 			}

 			state bool multi = parsed.size() > 1;
+			is_error = false;

 			state std::vector<std::vector<StringRef>>::iterator iter;
 			for (iter = parsed.begin(); iter != parsed.end(); ++iter) {
 				state std::vector<StringRef> tokens = *iter;

-				if (opt.exec.present() && is_error) {
+				if (is_error) {
 					printf("WARNING: the previous command failed, the remaining commands will not be executed.\n");
-					return 1;
+					break;
 				}

-				is_error = false;
-
 				if (!tokens.size())
 					continue;

+				if (tokencmp(tokens[0], "parse_error")) {
+					printf("ERROR: Command failed to completely parse.\n");
+					if (tokens.size() > 1) {
+						printf("ERROR: Not running partial or malformed command:");
+						for (auto t = tokens.begin() + 1; t != tokens.end(); ++t)
+							printf(" %s", formatStringRef(*t, true).c_str());
+						printf("\n");
+					}
+					is_error = true;
+					continue;
+				}
+
 				if (multi) {
 					printf(">>>");
 					for (auto t = tokens.begin(); t != tokens.end(); ++t)
@ -2717,6 +2734,25 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise) {
 					continue;
 				}

+				if (tokencmp(tokens[0], "datadistribution")) {
+					if (tokens.size() != 2) {
+						printf("Usage: datadistribution <on|off>\n");
+						is_error = true;
+					} else {
+						if(tokencmp(tokens[1], "on")) {
+							int _ = wait(setDDMode(db, 1));
+							printf("Data distribution is enabled\n");
+						} else if(tokencmp(tokens[1], "off")) {
+							int _ = wait(setDDMode(db, 0));
+							printf("Data distribution is disabled\n");
+						} else {
+							printf("Usage: datadistribution <on|off>\n");
+							is_error = true;
+						}
+					}
+					continue;
+				}
+
 				if (tokencmp(tokens[0], "option")) {
 					if (tokens.size() == 2 || tokens.size() > 4) {
 						printUsage(tokens[0]);
@ -2841,6 +2877,7 @@ ACTOR Future<Void> timeExit(double duration) {

 int main(int argc, char **argv) {
 	platformInit();
+	initSignalSafeUnwind();
 	Error::init();

 	registerCrashHandler();
--- a/fdbclient/ClientLogEvents.h
+++ b/fdbclient/ClientLogEvents.h
@ -64,88 +64,140 @@ namespace FdbClientLogEvents {
 	};

 	struct EventGet : public Event {
-		EventGet(double ts, double lat, int size) : Event(GET_LATENCY, ts), latency(lat), valueSize(size) { }
+		EventGet(double ts, double lat, int size, const KeyRef &in_key) : Event(GET_LATENCY, ts), latency(lat), valueSize(size), key(in_key) { } // GET_LATENCY event; now also records the key that was read

 		template <typename Ar>	Ar& serialize(Ar &ar) {
 			if (!ar.isDeserializing)
-				return Event::serialize(ar) & latency & valueSize;
+				return Event::serialize(ar) & latency & valueSize & key; // writing: base Event fields first, then the fields of this struct
 			else
-				return ar & latency & valueSize;
+				return ar & latency & valueSize & key; // reading: Event::serialize is not called on this path; presumably the reader has already consumed the base fields -- TODO confirm
 		}

 		double latency;
 		int valueSize;
+		Key key; // initialized from the in_key constructor argument

 		void logEvent(std::string id) const {
-			TraceEvent("TransactionTrace_Get").detail("TransactionID", id).detail("Latency", latency).detail("ValueSizeBytes", valueSize);
+			TraceEvent("TransactionTrace_Get").detail("TransactionID", id).detail("Latency", latency).detail("ValueSizeBytes", valueSize).detail("Key", printable(key)); // key goes through printable() before it is logged
 		}
 	};

 	struct EventGetRange : public Event {
-		EventGetRange(double ts, double lat, int size) : Event(GET_RANGE_LATENCY, ts), latency(lat), rangeSize(size) { }
+		EventGetRange(double ts, double lat, int size, const KeyRef &start_key, const KeyRef & end_key) : Event(GET_RANGE_LATENCY, ts), latency(lat), rangeSize(size), startKey(start_key), endKey(end_key) { } // GET_RANGE_LATENCY event; now also records the two range keys

 		template <typename Ar>	Ar& serialize(Ar &ar) {
 			if (!ar.isDeserializing)
-				return Event::serialize(ar) & latency & rangeSize;
+				return Event::serialize(ar) & latency & rangeSize & startKey & endKey; // writing: base Event fields first, then the fields of this struct
 			else
-				return ar & latency & rangeSize;
+				return ar & latency & rangeSize & startKey & endKey; // reading: Event::serialize is not called on this path; presumably the reader has already consumed the base fields -- TODO confirm
 		}

 		double latency;
 		int rangeSize;
+		Key startKey; // as passed by the caller; getRangeWrapper passes begin.getKey() of its key selector
+		Key endKey; // as passed by the caller; getRangeWrapper passes end.getKey() of its key selector

 		void logEvent(std::string id) const {
-			TraceEvent("TransactionTrace_GetRange").detail("TransactionID", id).detail("Latency", latency).detail("RangeSizeBytes", rangeSize);
+			TraceEvent("TransactionTrace_GetRange").detail("TransactionID", id).detail("Latency", latency).detail("RangeSizeBytes", rangeSize).detail("StartKey", printable(startKey)).detail("EndKey", printable(endKey)); // both keys go through printable() before they are logged
 		}
 	};

 	struct EventCommit : public Event {
-		EventCommit() :Event(COMMIT_LATENCY, 0) {}
-		EventCommit(double ts, double lat, int mut, int bytes) : Event(COMMIT_LATENCY, ts), latency(lat), numMutations(mut), commitBytes(bytes) { }
+		EventCommit(double ts, double lat, int mut, int bytes, CommitTransactionRequest *commit_req) : Event(COMMIT_LATENCY, ts), latency(lat), numMutations(mut), commitBytes(bytes), req(*commit_req) { } // copies *commit_req into req; commit_req is dereferenced unconditionally, so it must be non-null

 		template <typename Ar>	Ar& serialize(Ar &ar) {
 			if (!ar.isDeserializing)
-				return Event::serialize(ar) & latency & numMutations & commitBytes;
+				return Event::serialize(ar) & latency & numMutations & commitBytes & req.transaction & req.arena; // writing: base Event fields first; of req only transaction and arena are written
 			else
-				return ar & latency & numMutations & commitBytes;
+				return ar & latency & numMutations & commitBytes & req.transaction & req.arena; // reading: Event::serialize is not called on this path; presumably the reader has already consumed the base fields -- TODO confirm
 		}

 		double latency;
 		int numMutations;
 		int commitBytes;
+		CommitTransactionRequest req; // Only the CommitTransactionRef (req.transaction) and the Arena (req.arena) of the request are serialized

 		void logEvent(std::string id) const {
+			for (auto &read_range : req.transaction.read_conflict_ranges) { // one trace event per read conflict range
+				TraceEvent("TransactionTrace_Commit_ReadConflictRange").detail("TransactionID", id).detail("Begin", printable(read_range.begin)).detail("End", printable(read_range.end));
+			}
+
+			for (auto &write_range : req.transaction.write_conflict_ranges) { // one trace event per write conflict range
+				TraceEvent("TransactionTrace_Commit_WriteConflictRange").detail("TransactionID", id).detail("Begin", printable(write_range.begin)).detail("End", printable(write_range.end));
+			}
+
+			for (auto &mutation : req.transaction.mutations) { // one trace event per mutation, rendered with MutationRef::toString()
+				TraceEvent("TransactionTrace_Commit_Mutation").detail("TransactionID", id).detail("Mutation", mutation.toString());
+			}
+
 			TraceEvent("TransactionTrace_Commit").detail("TransactionID", id).detail("Latency", latency).detail("NumMutations", numMutations).detail("CommitSizeBytes", commitBytes);
 		}
 	};

-	struct EventError : public Event {
-		EventError(EventType t, double ts, int err_code) : Event(t, ts), errCode(err_code) { }
+	struct EventGetError : public Event { // ERROR_GET event: error code plus the key of the failed get
+		EventGetError(double ts, int err_code, const KeyRef &in_key) : Event(ERROR_GET, ts), errCode(err_code), key(in_key) { }

 		template <typename Ar>	Ar& serialize(Ar &ar) {
 			if (!ar.isDeserializing)
-				return Event::serialize(ar) & errCode;
+				return Event::serialize(ar) & errCode & key; // writing: base Event fields first, then the fields of this struct
 			else
-				return ar & errCode;
+				return ar & errCode & key; // reading: Event::serialize is not called on this path; presumably the reader has already consumed the base fields -- TODO confirm
 		}
+
 		int errCode;
+		Key key; // initialized from the in_key constructor argument

 		void logEvent(std::string id) const {
-			const char *eventName;
-			if(type == ERROR_GET) {
-				eventName = "TransactionTrace_GetError";
-			}
-			else if(type == ERROR_GET_RANGE) {
-				eventName = "TransactionTrace_GetRangeError";
-			}
-			else if(type == ERROR_COMMIT) {
-				eventName = "TransactionTrace_CommitError";
-			}
-			else {
-				eventName = "TransactionTrace_Error";
+			TraceEvent("TransactionTrace_GetError").detail("TransactionID", id).detail("ErrCode", errCode).detail("Key", printable(key)); // NOTE(review): no severity argument here, while the removed EventError::logEvent passed SevWarn and also logged a Description field
+		}
+	};
+
+	struct EventGetRangeError : public Event { // ERROR_GET_RANGE event: error code plus the two keys of the failed range read
+		EventGetRangeError(double ts, int err_code, const KeyRef &start_key, const KeyRef & end_key) : Event(ERROR_GET_RANGE, ts), errCode(err_code), startKey(start_key), endKey(end_key) { }
+
+		template <typename Ar>	Ar& serialize(Ar &ar) {
+			if (!ar.isDeserializing)
+				return Event::serialize(ar) & errCode & startKey & endKey; // writing: base Event fields first, then the fields of this struct
+			else
+				return ar & errCode & startKey & endKey; // reading: Event::serialize is not called on this path; presumably the reader has already consumed the base fields -- TODO confirm
+		}
+
+		int errCode;
+		Key startKey; // as passed by the caller; getRangeWrapper passes begin.getKey() of its key selector
+		Key endKey; // as passed by the caller; getRangeWrapper passes end.getKey() of its key selector
+
+		void logEvent(std::string id) const {
+			TraceEvent("TransactionTrace_GetRangeError").detail("TransactionID", id).detail("ErrCode", errCode).detail("StartKey", printable(startKey)).detail("EndKey", printable(endKey)); // emitted without an explicit severity argument
+		}
+	};
+
+	struct EventCommitError : public Event { // ERROR_COMMIT event: error code plus a copy of the commit request that failed
+		EventCommitError(double ts, int err_code, CommitTransactionRequest *commit_req) : Event(ERROR_COMMIT, ts), errCode(err_code), req(*commit_req) { } // commit_req is dereferenced unconditionally, so it must be non-null
+	
+		template <typename Ar>	Ar& serialize(Ar &ar) {
+			if (!ar.isDeserializing)
+				return Event::serialize(ar) & errCode & req.transaction & req.arena; // writing: base Event fields first; of req only transaction and arena are written
+			else
+				return ar & errCode & req.transaction & req.arena; // reading: Event::serialize is not called on this path; presumably the reader has already consumed the base fields -- TODO confirm
+		}
+
+		int errCode;
+		CommitTransactionRequest req; // Only the CommitTransactionRef (req.transaction) and the Arena (req.arena) of the request are serialized
+
+		void logEvent(std::string id) const {
+			for (auto &read_range : req.transaction.read_conflict_ranges) { // one trace event per read conflict range
+				TraceEvent("TransactionTrace_CommitError_ReadConflictRange").detail("TransactionID", id).detail("Begin", printable(read_range.begin)).detail("End", printable(read_range.end));
 			}

-			TraceEvent(SevWarn, eventName).detail("TransactionID", id).detail("Error", errCode).detail("Description", Error(errCode).what());
+			for (auto &write_range : req.transaction.write_conflict_ranges) { // one trace event per write conflict range
+				TraceEvent("TransactionTrace_CommitError_WriteConflictRange").detail("TransactionID", id).detail("Begin", printable(write_range.begin)).detail("End", printable(write_range.end));
+			}
+
+			for (auto &mutation : req.transaction.mutations) { // one trace event per mutation, rendered with MutationRef::toString()
+				TraceEvent("TransactionTrace_CommitError_Mutation").detail("TransactionID", id).detail("Mutation", mutation.toString());
+			}
+
+			TraceEvent("TransactionTrace_CommitError").detail("TransactionID", id).detail("ErrCode", errCode); // summary event, emitted after the per-range and per-mutation events
 		}
 	};
 }
--- a/fdbclient/CommitTransaction.h
+++ b/fdbclient/CommitTransaction.h
@ -27,6 +27,7 @@
 struct MutationRef {
 	static const int OVERHEAD_BYTES = 12; //12 is the size of Header in MutationList entries
 	enum Type : uint8_t { SetValue=0, ClearRange, AddValue, DebugKeyRange, DebugKey, NoOp, And, Or, Xor, AppendIfFits, AvailableForReuse, Reserved_For_LogProtocolMessage /* See fdbserver/LogProtocolMessage.h */, Max, Min, SetVersionstampedKey, SetVersionstampedValue, MAX_ATOMIC_OP };
+	const char * typeString[MAX_ATOMIC_OP] = { "SetValue", "ClearRange", "AddValue", "DebugKeyRange", "DebugKey", "NoOp", "And", "Or", "Xor", "AppendIfFits", "AvailableForReuse", "Reserved_For_LogProtocolMessage", "Max", "Min", "SetVersionstampedKey", "SetVersionstampedValue" };
 	// This is stored this way for serialization purposes.
 	uint8_t type;
 	StringRef param1, param2;
@ -38,7 +39,12 @@ struct MutationRef {
 	int expectedSize() const { return param1.size() + param2.size(); }

 	std::string toString() const {
-		return format("code: %d param1: %s param2: %s", type, printable(param1).c_str(), printable(param2).c_str());
+		if (type < MutationRef::MAX_ATOMIC_OP) {
+			return format("code: %s param1: %s param2: %s", typeString[type], printable(param1).c_str(), printable(param2).c_str());
+		}
+		else {
+			return format("code: %s param1: %s param2: %s", "Invalid", printable(param1).c_str(), printable(param2).c_str());
+		}
 	}

 	template <class Ar>
--- a/fdbclient/Knobs.cpp
+++ b/fdbclient/Knobs.cpp
@ -45,7 +45,7 @@ ClientKnobs::ClientKnobs(bool randomize) {

 	init( WRONG_SHARD_SERVER_DELAY,                .01 ); if( randomize && BUGGIFY ) WRONG_SHARD_SERVER_DELAY = g_random->random01(); // FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY; // SOMEDAY: This delay can limit performance of retrieving data when the cache is mostly wrong (e.g. dumping the database after a test)
 	init( FUTURE_VERSION_RETRY_DELAY,              .01 ); if( randomize && BUGGIFY ) FUTURE_VERSION_RETRY_DELAY = g_random->random01();// FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY;
-	init( REPLY_BYTE_LIMIT,                     500000 );
+	init( REPLY_BYTE_LIMIT,                      80000 );
 	init( DEFAULT_BACKOFF,                         .01 ); if( randomize && BUGGIFY ) DEFAULT_BACKOFF = g_random->random01();
 	init( DEFAULT_MAX_BACKOFF,                     1.0 );
 	init( BACKOFF_GROWTH_RATE,                     2.0 );
--- a/fdbclient/ManagementAPI.actor.cpp
+++ b/fdbclient/ManagementAPI.actor.cpp
@ -123,6 +123,15 @@ std::map<std::string, std::string> configForToken( std::string const& mode ) {
 		tLogPolicy = IRepPolicyRef(new PolicyAcross(2, "data_hall",
 			IRepPolicyRef(new PolicyAcross(2, "zoneid", IRepPolicyRef(new PolicyOne())))
 		));
+	} else if(mode == "multi_dc") {
+		redundancy="6";
+		log_replicas="4";
+		storagePolicy = IRepPolicyRef(new PolicyAcross(3, "dcid",
+			IRepPolicyRef(new PolicyAcross(2, "zoneid", IRepPolicyRef(new PolicyOne())))
+		));
+		tLogPolicy = IRepPolicyRef(new PolicyAcross(2, "dcid",
+			IRepPolicyRef(new PolicyAcross(2, "zoneid", IRepPolicyRef(new PolicyOne())))
+		));
 	} else
 		redundancySpecified = false;
 	if (redundancySpecified) {
@ -1044,6 +1053,39 @@ ACTOR Future<vector<AddressExclusion>> getExcludedServers( Database cx ) {
 	}
 }

+ACTOR Future<int> setDDMode( Database cx, int mode ) { // Writes mode to dataDistributionModeKey in a retry loop and returns the mode read before the write
+	state Transaction tr(cx);
+	state int oldMode = -1; // negative means the previous mode has not been captured yet
+	state BinaryWriter wr(Unversioned());
+	wr << mode; // serialized once up front; the same bytes are reused on every retry
+
+	loop {
+		try {
+			Optional<Value> old = wait( tr.get( dataDistributionModeKey ) );
+			if (oldMode < 0) { // capture only while no previous mode has been recorded
+				oldMode = 1; // value reported when the key is absent
+				if (old.present()) {
+					BinaryReader rd(old.get(), Unversioned());
+					rd >> oldMode;
+				}
+			}
+			if (!mode) { // mode 0 additionally writes dataDistributionModeLock into moveKeysLockOwnerKey
+				BinaryWriter wrMyOwner(Unversioned());
+				wrMyOwner << dataDistributionModeLock;
+				tr.set( moveKeysLockOwnerKey, wrMyOwner.toStringRef() );
+			}
+
+			tr.set( dataDistributionModeKey, wr.toStringRef() );
+
+			Void _ = wait( tr.commit() );
+			return oldMode;
+		} catch (Error& e) {
+			TraceEvent("setDDModeRetrying").error(e);
+			Void _ = wait (tr.onError(e)); // the loop runs again once onError() completes
+		}
+	}
+}
+
 ACTOR Future<Void> waitForExcludedServers( Database cx, vector<AddressExclusion> excl ) {
 	state std::set<AddressExclusion> exclusions( excl.begin(), excl.end() );

--- a/fdbclient/ManagementAPI.h
+++ b/fdbclient/ManagementAPI.h
@ -153,6 +153,8 @@ Future<Void> unlockDatabase( Database const& cx, UID const& id );
 Future<Void> checkDatabaseLock( Transaction* const& tr, UID const& id );
 Future<Void> checkDatabaseLock( Reference<ReadYourWritesTransaction> const& tr, UID const& id );

+Future<int> setDDMode( Database const& cx, int const& mode );
+
 // Gets the cluster connection string
 Future<std::vector<NetworkAddress>> getCoordinators( Database const& cx );
 #endif
--- a/fdbclient/MasterProxyInterface.h
+++ b/fdbclient/MasterProxyInterface.h
@ -33,7 +33,7 @@ struct MasterProxyInterface {
 	RequestStream< struct CommitTransactionRequest > commit;
 	RequestStream< struct GetReadVersionRequest > getConsistentReadVersion;  // Returns a version which (1) is committed, and (2) is >= the latest version reported committed (by a commit response) when this request was sent
 															     //   (at some point between when this request is sent and when its response is received, the latest version reported committed)
-	RequestStream< ReplyPromise<vector<StorageServerInterface>> > getKeyServersLocations;
+	RequestStream< ReplyPromise<vector<pair<KeyRangeRef, vector<StorageServerInterface>>>> > getKeyServersLocations;
 	RequestStream< struct GetStorageServerRejoinInfoRequest > getStorageServerRejoinInfo;

 	RequestStream<ReplyPromise<Void>> waitFailure;
--- a/fdbclient/MetricLogger.actor.cpp
+++ b/fdbclient/MetricLogger.actor.cpp
--- a/bindings/java/src-completable/test/com/apple/cie/foundationdb/test/OSTest.java
+++ b/bindings/java/src-completable/test/com/apple/cie/foundationdb/test/OSTest.java
@ -1,5 +1,5 @@
 /*
- * OSTest.java
+ * MetricLogger.h
 *
 * This source file is part of the FoundationDB open source project
 *
@ -18,21 +18,8 @@
 * limitations under the License.
 */

-package com.apple.cie.foundationdb.test;
+#pragma once

-import java.io.InputStream;
+#include "NativeAPI.h"

-public class OSTest {
-
-	/**
-	 * @param args
-	 */
-	public static void main(String[] args) {
-		System.out.println("OS name: " + System.getProperty("os.name"));
-		System.out.println("OS arch: " + System.getProperty("os.arch"));
-
-		InputStream stream = OSTest.class.getResourceAsStream("/lib/linux/amd64/libfdb_java.so");
-		System.out.println("Stream: " + stream);
-	}
-
-}
+Future<Void> runMetrics( Future<Database> const& fcx, Key const& metricsPrefix );
--- a/fdbclient/MonitorLeader.actor.cpp
+++ b/fdbclient/MonitorLeader.actor.cpp
@ -79,8 +79,7 @@ ClusterConnectionString const& ClusterConnectionFile::getConnectionString() {

 void ClusterConnectionFile::notifyConnected() {
 	if (setConn){
-		this->setConnectionString(this->getConnectionString());
-		setConn = false;
+		this->writeFile(); // writeFile() clears setConn itself; its bool result is not checked here
 	}
 }

@ -106,6 +105,7 @@ bool ClusterConnectionFile::fileContentsUpToDate(ClusterConnectionString &fileCo
 }

 bool ClusterConnectionFile::writeFile() {
+	setConn = false;
 	if(filename.size()) {
 		try {
 			atomicReplace( filename, "# DO NOT EDIT!\n# This file is auto-generated, it is not to be edited by hand\n" + cs.toString().append("\n") );
@ -306,21 +306,20 @@ ClientLeaderRegInterface::ClientLeaderRegInterface( INetwork* local ) {
 	getLeader.makeWellKnownEndpoint( WLTOKEN_CLIENTLEADERREG_GETLEADER, TaskCoordination );
 }

-ACTOR Future<Void> monitorNominee( Key key, ClientLeaderRegInterface coord, Reference<AsyncVar<vector<Optional<LeaderInfo>>>> nominees, int index ) {
+ACTOR Future<Void> monitorNominee( Key key, ClientLeaderRegInterface coord, AsyncTrigger* nomineeChange, Optional<LeaderInfo> *info, int generation ) { // Polls one coordinator; writes each changed reply to *info and fires nomineeChange
 	loop {
-		auto const& nom = nominees->get()[index];
-		Optional<LeaderInfo> li = wait( retryBrokenPromise( coord.getLeader, GetLeaderRequest( key, nom.present() ? nom.get().changeID : UID() ), TaskCoordinationReply ) );
-		TraceEvent("GetLeaderReply").detail("Coordinator", coord.getLeader.getEndpoint().address).detail("Nominee", li.present() ? li.get().changeID : UID());
+		state Optional<LeaderInfo> li = wait( retryBrokenPromise( coord.getLeader, GetLeaderRequest( key, info->present() ? info->get().changeID : UID() ), TaskCoordinationReply ) ); // request carries the changeID currently stored in *info, or UID() if none
+		Void _ = wait( Future<Void>(Void()) ); // Make sure we weren't cancelled

-		if (li != nominees->get()[index]) {
-			vector<Optional<LeaderInfo>> v = nominees->get();
-			v[index] = li;
-			nominees->set(v);
+		TraceEvent("GetLeaderReply").detail("Coordinator", coord.getLeader.getEndpoint().address).detail("Nominee", li.present() ? li.get().changeID : UID()).detail("Generation", generation);
+
+		if (li != *info) {
+			*info = li; // writes through the caller-supplied pointer; the pointee must outlive this actor
+			nomineeChange->trigger();

 			if( li.present() && li.get().forward )
 				Void _ = wait( Future<Void>(Never()) );
-
-			Void _ = wait( Future<Void>(Void()) ); // Make sure we weren't cancelled
+			Void _ = wait( Future<Void>(Void()) ); // NOTE(review): presumably the same cancellation check as the commented wait earlier in the loop -- confirm
 		}
 	}
 }
@ -344,49 +343,61 @@ Optional<LeaderInfo> getLeader( vector<Optional<LeaderInfo>> nominees ) {
 		return Optional<LeaderInfo>();
 }

-ACTOR Future<Void> monitorLeaderInternal( Reference<ClusterConnectionFile> connFile, Reference<AsyncVar<Value>> outSerializedLeaderInfo ) {
-	state ActorCollection ac(false);
-	state Reference<ClusterConnectionFile> intermediateConnFile = connFile;
-	state bool hasConnected = false;
+struct MonitorLeaderInfo { // State carried from one monitorLeaderOneGeneration() run to the next
+	bool hasConnected; // set to true by monitorLeaderOneGeneration once a non-forwarding leader has been seen
+	Reference<ClusterConnectionFile> intermediateConnFile; // connection file used to build the ClientCoordinators of the current generation
+	int generation; // incremented by monitorLeaderInternal after each generation returns
+
+	MonitorLeaderInfo() : hasConnected(false), generation(0) {}
+	explicit MonitorLeaderInfo( Reference<ClusterConnectionFile> intermediateConnFile ) : hasConnected(false), intermediateConnFile(intermediateConnFile), generation(0) {} // mem-initializers listed in member declaration order, which is the order they run in
+};
+
+ACTOR Future<MonitorLeaderInfo> monitorLeaderOneGeneration( Reference<ClusterConnectionFile> connFile, Reference<AsyncVar<Value>> outSerializedLeaderInfo, MonitorLeaderInfo info ) { // Watches the coordinators of info.intermediateConnFile; returns the updated info when a forwarding leader is seen
+	state ClientCoordinators coordinators( info.intermediateConnFile );
+	state AsyncTrigger nomineeChange;
+	state std::vector<Optional<LeaderInfo>> nominees;
+	state Future<Void> allActors;
+
+	nominees.resize(coordinators.clientLeaderServers.size()); // sized once before the actors start, because each monitorNominee is handed a raw pointer to its element
+
+	std::vector<Future<Void>> actors;
+	for(int i=0; i<coordinators.clientLeaderServers.size(); i++)
+		actors.push_back( monitorNominee( coordinators.clusterKey, coordinators.clientLeaderServers[i], &nomineeChange, &nominees[i], info.generation ) );
+	allActors = waitForAll(actors);

 	loop {
-		ac.clear(false);
-		state ClientCoordinators coordinators( intermediateConnFile );
-		state Reference<AsyncVar<vector<Optional<LeaderInfo>>>> nominees( new AsyncVar<vector<Optional<LeaderInfo>>>() );
-
-		nominees->set( vector<Optional<LeaderInfo>>( coordinators.clientLeaderServers.size() ) );
-
-		for(int i=0; i<coordinators.clientLeaderServers.size(); i++)
-			ac.add( monitorNominee( coordinators.clusterKey, coordinators.clientLeaderServers[i], nominees, i ) );
-
-		loop {
-			state Optional<LeaderInfo> leader = getLeader(nominees->get());
-			TraceEvent("MonitorLeaderChange").detail("NewLeader", leader.present() ? leader.get().changeID : UID());
-			if (leader.present()) {
-				if( leader.get().forward ) {
-					intermediateConnFile = Reference<ClusterConnectionFile>(new ClusterConnectionFile(connFile->getFilename(), ClusterConnectionString(leader.get().serializedInfo.toString())));
-					TraceEvent("MonitorLeaderForwarding").detail("ConnStr", intermediateConnFile->getConnectionString().toString());
-					break;
-				}
-				if(connFile != intermediateConnFile) {
-					if(!hasConnected) {
-						TraceEvent(SevWarnAlways, "IncorrectClusterFileContentsAtConnection").detail("Filename", connFile->getFilename())
-							.detail("ConnectionStringFromFile", connFile->getConnectionString().toString())
-							.detail("CurrentConnectionString", intermediateConnFile->getConnectionString().toString());
-					}
-					connFile->setConnectionString(intermediateConnFile->getConnectionString());
-					intermediateConnFile = connFile;
-				}
-
-				hasConnected = true;
-				connFile->notifyConnected();
-
-				outSerializedLeaderInfo->set( leader.get().serializedInfo );
+		Optional<LeaderInfo> leader = getLeader(nominees);
+		TraceEvent("MonitorLeaderChange").detail("NewLeader", leader.present() ? leader.get().changeID : UID(1,1)); // UID(1,1) is logged when no leader is present
+		if (leader.present()) {
+			if( leader.get().forward ) {
+				TraceEvent("MonitorLeaderForwarding").detail("NewConnStr", leader.get().serializedInfo.toString()).detail("OldConnStr", info.intermediateConnFile->getConnectionString().toString());
+				info.intermediateConnFile = Reference<ClusterConnectionFile>(new ClusterConnectionFile(connFile->getFilename(), ClusterConnectionString(leader.get().serializedInfo.toString()))); // new connection string comes from the forwarded serializedInfo; the filename stays that of connFile
+				return info; // ends this generation; the caller starts the next one with the updated info
+			}
+			if(connFile != info.intermediateConnFile) {
+				if(!info.hasConnected) {
+					TraceEvent(SevWarnAlways, "IncorrectClusterFileContentsAtConnection").detail("Filename", connFile->getFilename())
+						.detail("ConnectionStringFromFile", connFile->getConnectionString().toString())
+						.detail("CurrentConnectionString", info.intermediateConnFile->getConnectionString().toString());
+				}
+				connFile->setConnectionString(info.intermediateConnFile->getConnectionString()); // copy the connection string that actually worked into the original file object
+				info.intermediateConnFile = connFile;
 			}
-			//else
-			//	outSerializedLeaderInfo->set( Value() );  // or keep talking to the last known leader??

-			Void _ = wait( nominees->onChange() || ac.getResult() );
+			info.hasConnected = true;
+			connFile->notifyConnected();
+
+			outSerializedLeaderInfo->set( leader.get().serializedInfo );
 		}
+		Void _ = wait( nomineeChange.onTrigger() || allActors ); // re-evaluate the leader when any monitorNominee reports a change, or when allActors becomes ready
+	}
+}
+
+ACTOR Future<Void> monitorLeaderInternal( Reference<ClusterConnectionFile> connFile, Reference<AsyncVar<Value>> outSerializedLeaderInfo ) { // Runs monitorLeaderOneGeneration in a loop, feeding each returned MonitorLeaderInfo into the next run
+	state MonitorLeaderInfo info(connFile); // first generation starts from connFile itself
+	loop {
+		MonitorLeaderInfo _info = wait( monitorLeaderOneGeneration( connFile, outSerializedLeaderInfo, info) );
+		info = _info;
+		info.generation++; // the generation is passed on to monitorNominee, which adds it to its GetLeaderReply trace events
 	}
 }
--- a/fdbclient/NativeAPI.actor.cpp
+++ b/fdbclient/NativeAPI.actor.cpp
@ -602,7 +602,7 @@ Reference<LocationInfo> DatabaseContext::setCachedLocation( const KeyRangeRef& k
 		attempts++;
 		auto r = locationCache.randomRange();
 		Key begin = r.begin(), end = r.end();  // insert invalidates r, so can't be passed a mere reference into it
-		if( begin >= keyServersPrefix )
+		if( begin >= keyServersPrefix && attempts > maxEvictionAttempts / 2)
 			continue;
 		locationCache.insert( KeyRangeRef(begin, end), Reference<LocationInfo>() );
 	}
@ -754,8 +754,8 @@ Reference<Cluster> Cluster::createCluster(std::string connFileName, int apiVersi
 	return Reference<Cluster>(new Cluster( rccf, apiVersion));
 }

-Future<Database> Cluster::createDatabase( Standalone<StringRef> dbName ) {
-	return DatabaseContext::createDatabase( clusterInterface, Reference<Cluster>::addRef( this ), dbName, LocalityData() );
+Future<Database> Cluster::createDatabase( Standalone<StringRef> dbName, LocalityData locality ) {
+	return DatabaseContext::createDatabase( clusterInterface, Reference<Cluster>::addRef( this ), dbName, locality ); // forwards the caller-supplied locality; the declaration in NativeAPI.h defaults it to LocalityData()
 }

 Future<Void> Cluster::onConnected() {
@ -1102,22 +1102,33 @@ ACTOR Future< pair<KeyRange,Reference<LocationInfo>> > getKeyLocation( Database

 	state vector<StorageServerInterface> serverInterfaces;
 	state KeyRangeRef range;
-
+	
 	// We assume that not only /FF/keyServers but /FF/serverList is present on the keyServersLocations since we now need both of them to terminate our search. Currently this is guaranteed because nothing after /FF/keyServers is split.
 	if ( ( key.startsWith( serverListPrefix) && (!isBackward || key.size() > serverListPrefix.size()) ) ||
 		( key.startsWith( keyServersPrefix ) && (!isBackward || key.size() > keyServersPrefix.size()) )) {
 		if( info.debugID.present() )
-			g_traceBatch.addEvent("TransactionDebug", info.debugID.get().first(), "NativeAPI.getKeyLocation.Before");
+			g_traceBatch.addEvent("TransactionDebug", info.debugID.get().first(), "NativeAPI.getKeyLocation.Before");	
 		loop {
 			choose {
 				when ( Void _ = wait( cx->onMasterProxiesChanged() ) ) {}
-				when ( vector<StorageServerInterface> s = wait( loadBalance( cx->getMasterProxies(), &MasterProxyInterface::getKeyServersLocations, ReplyPromise<vector<StorageServerInterface>>(), info.taskID ) ) ) {
+				when ( vector<pair<KeyRangeRef, vector<StorageServerInterface>>> keyServersShards = wait( loadBalance( cx->getMasterProxies(), &MasterProxyInterface::getKeyServersLocations, ReplyPromise<vector<pair<KeyRangeRef, vector<StorageServerInterface>>>>(), info.taskID ) ) ) {
 					if( info.debugID.present() )
 						g_traceBatch.addEvent("TransactionDebug", info.debugID.get().first(), "NativeAPI.getKeyLocation.After");
-					ASSERT( s.size() );  // There should always be storage servers, except on version 0 which should not get to this function
-					range = KeyRangeRef( keyServersPrefix, allKeys.end );
-					serverInterfaces = s;
-					break;
+					ASSERT( keyServersShards.size() );  // There should always be storage servers, except on version 0 which should not get to this function
+
+					Reference<LocationInfo> cachedLocation;
+					for (pair<KeyRangeRef, vector<StorageServerInterface>> keyServersShard : keyServersShards) {
+						auto locationInfo = cx->setCachedLocation(keyServersShard.first, keyServersShard.second);
+
+						if (isBackward ? (keyServersShard.first.begin < key && keyServersShard.first.end >= key) : keyServersShard.first.contains(key)) {
+							range = keyServersShard.first;
+							cachedLocation = locationInfo;
+						}
+					}
+
+					ASSERT(isBackward ? (range.begin < key && range.end >= key) : range.contains(key));
+
+					return make_pair(range, cachedLocation);
 				}
 			}
 		}
@ -1334,7 +1345,7 @@ ACTOR Future<Optional<Value>> getValue( Future<Version> version, Key key, Databa
 			cx->readLatencies.addSample(latency);
 			if (trLogInfo) {
 				int valueSize = reply.value.present() ? reply.value.get().size() : 0;
-				trLogInfo->addLog(FdbClientLogEvents::EventGet(startTimeD, latency, valueSize));
+				trLogInfo->addLog(FdbClientLogEvents::EventGet(startTimeD, latency, valueSize, key));
 			}
 			cx->getValueCompleted->latency = timer_int() - startTime;
 			cx->getValueCompleted->log();
@ -1364,7 +1375,7 @@ ACTOR Future<Optional<Value>> getValue( Future<Version> version, Key key, Databa
 				Void _ = wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, info.taskID));
 			} else {
 				if (trLogInfo)
-					trLogInfo->addLog(FdbClientLogEvents::EventError(FdbClientLogEvents::ERROR_GET, startTimeD, static_cast<int>(e.code())));
+					trLogInfo->addLog(FdbClientLogEvents::EventGetError(startTimeD, static_cast<int>(e.code()), key));
 				throw e;
 			}
 		}
@ -1654,6 +1665,15 @@ Future<Key> resolveKey( Database const& cx, KeySelector const& key, Version cons
 ACTOR Future<Standalone<RangeResultRef>> getRangeFallback( Database cx, Version version,
 	KeySelector begin, KeySelector end, GetRangeLimits limits, bool reverse, TransactionInfo info )
 {
+	if(version == latestVersion) {
+		state Transaction transaction(cx);
+		transaction.setOption(FDBTransactionOptions::CAUSAL_READ_RISKY);
+		transaction.setOption(FDBTransactionOptions::LOCK_AWARE);
+		transaction.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
+		Version ver = wait( transaction.getReadVersion() );
+		version = ver;
+	}
+
 	Future<Key> fb = resolveKey(cx, begin, version, info);
 	state Future<Key> fe = resolveKey(cx, end, version, info);

@ -1849,15 +1869,8 @@ ACTOR Future<Standalone<RangeResultRef>> getRange( Database cx, Future<Version>
 				cx->invalidateCache( beginServer.second );

 				if (e.code() == error_code_wrong_shard_server) {
-					if (version == latestVersion) {
-						// latestVersion queries are only for keyServersPrefix/*, which shard is guaranteed not to split,
-						//   so we should always be able to use the fast path--try again
-						TEST(true); //Latest version retry fast path
-						TraceEvent("LatestVersionRetryFastPath").detail("KeyBegin", printable(begin.getKey())).detail("KeyEnd", printable(end.getKey()));
-					} else {
 						Standalone<RangeResultRef> result = wait( getRangeFallback(cx, version, originalBegin, originalEnd, originalLimits, reverse, info ) );
 						return result;
-					}
 				}

 				Void _ = wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, info.taskID));
@ -1877,13 +1890,13 @@ ACTOR Future<Standalone<RangeResultRef>> getRangeWrapper(Database cx, Reference<
 			int rangeSize = 0;
 			for (const KeyValueRef &res : ret.contents())
 				rangeSize += res.key.size() + res.value.size();
-			trLogInfo->addLog(FdbClientLogEvents::EventGetRange(startTime, latency, rangeSize));
+			trLogInfo->addLog(FdbClientLogEvents::EventGetRange(startTime, latency, rangeSize, begin.getKey(), end.getKey()));
 		}
 		return ret;
 	}
 	catch (Error &e) {
 		if (trLogInfo)
-			trLogInfo->addLog(FdbClientLogEvents::EventError(FdbClientLogEvents::ERROR_GET_RANGE, startTime, static_cast<int>(e.code())));
+			trLogInfo->addLog(FdbClientLogEvents::EventGetRangeError(startTime, static_cast<int>(e.code()), begin.getKey(), end.getKey()));
 		throw;
 	}
 }
@ -2506,7 +2519,7 @@ ACTOR static Future<Void> tryCommit( Database cx, Reference<TransactionLogInfo>
 					cx->commitLatencies.addSample(latency);
 					cx->latencies.addSample(now() - tr->startTime);
 					if (trLogInfo)
-						trLogInfo->addLog(FdbClientLogEvents::EventCommit(startTime, latency, req->transaction.mutations.size(), req->transaction.mutations.expectedSize()));
+						trLogInfo->addLog(FdbClientLogEvents::EventCommit(startTime, latency, req->transaction.mutations.size(), req->transaction.mutations.expectedSize(), req));
 					return Void();
 				} else {
 					if (info.debugID.present())
@ -2543,8 +2556,8 @@ ACTOR static Future<Void> tryCommit( Database cx, Reference<TransactionLogInfo>
 		} else {
 			if (e.code() != error_code_past_version && e.code() != error_code_not_committed && e.code() != error_code_database_locked)
 				TraceEvent(SevError, "tryCommitError").error(e);
-			if (trLogInfo)
-				trLogInfo->addLog(FdbClientLogEvents::EventError(FdbClientLogEvents::ERROR_COMMIT, startTime, static_cast<int>(e.code())));
+			if (e.code() != error_code_actor_cancelled && trLogInfo)
+				trLogInfo->addLog(FdbClientLogEvents::EventCommitError(startTime, static_cast<int>(e.code()), req));
 			throw;
 		}
 	}
--- a/fdbclient/NativeAPI.h
+++ b/fdbclient/NativeAPI.h
@ -110,7 +110,7 @@ public:
 	static Reference<Cluster> createCluster(std::string connFileName, int apiVersion);

 	// See DatabaseContext::createDatabase
-	Future<Database> createDatabase( Standalone<StringRef> dbName );
+	Future<Database> createDatabase( Standalone<StringRef> dbName, LocalityData locality = LocalityData() );

 	void setOption(FDBClusterOptions::Option option, Optional<StringRef> value);

--- a/fdbclient/ReadYourWrites.actor.cpp
+++ b/fdbclient/ReadYourWrites.actor.cpp
@ -292,7 +292,10 @@ public:

 		if ( key.offset <= 0 && it.beginKey() == key.getKey() && key.getKey() != allKeys.begin )
 			--it;
+
 		ExtStringRef keykey = key.getKey();
+		bool keyNeedsCopy = false;
+
 		// Invariant: it.beginKey() <= keykey && keykey <= it.endKey() && (key.isBackward() ? it.beginKey() != keykey : it.endKey() != keykey)
 		// Maintaining this invariant, we transform the key selector toward firstGreaterOrEqual form until we reach an unknown range or the result
 		while (key.offset > 1 && !it.is_unreadable() && !it.is_unknown_range() && it.endKey() < maxKey ) {
@ -300,17 +303,20 @@ public:
 				--key.offset;
 			++it;
 			keykey = it.beginKey();
+			keyNeedsCopy = true;
 		}
 		while (key.offset < 1 && !it.is_unreadable() && !it.is_unknown_range() && it.beginKey() != allKeys.begin) {
 			if (it.is_kv()) {
 				++key.offset;
 				if (key.offset == 1) {
 					keykey = it.beginKey();
+					keyNeedsCopy = true;
 					break;
 				}
 			}
 			--it;
 			keykey = it.endKey();
+			keyNeedsCopy = true;
 		}

 		if(!alreadyExhausted) {
@ -326,7 +332,7 @@ public:
 		
 		if (!it.is_unreadable() && !it.is_unknown_range() && key.offset > 1) {
 			*readThroughEnd = true;
-			key.setKey(maxKey);
+			key.setKey(maxKey); // maxKey is a KeyRef, but points to a LiteralStringRef. TODO: how can we ASSERT this?
 			key.offset = 1;
 			return;
 		}
@ -334,9 +340,15 @@ public:
 		while (!it.is_unreadable() && it.is_empty_range() && it.endKey() < maxKey) {
 			++it;
 			keykey = it.beginKey();
+			keyNeedsCopy = true;
 		}

-		key.setKey(keykey.toArenaOrRef(key.arena()));
+		if(keyNeedsCopy) {
+			key.setKey(keykey.toArena(key.arena()));
+		}
+		else {
+			key.setKey(keykey.toArenaOrRef(key.arena()));
+		}
 	}

 	static KeyRangeRef getKnownKeyRange( RangeResultRef data, KeySelector begin, KeySelector end, Arena& arena ) {
@ -564,7 +576,10 @@ public:
 					additionalRows += singleClears;
 				}

-				read_end.setKey(std::max(read_end.getKey(), read_begin.getKey()));
+				if(read_end.getKey() < read_begin.getKey()) {
+					read_end.setKey(read_begin.getKey());
+					read_end.arena().dependsOn(read_begin.arena());
+				}

 				state GetRangeLimits requestLimit = limits;
 				setRequestLimits(requestLimit, additionalRows, 2-read_begin.offset, requestCount);
@ -829,7 +844,10 @@ public:
 					additionalRows += singleClears;
 				}

-				read_begin.setKey(std::min( read_begin.getKey(), read_end.getKey() ));
+				if(read_begin.getKey() > read_end.getKey()) {
+					read_begin.setKey(read_end.getKey());
+					read_begin.arena().dependsOn(read_end.arena());
+				}

 				state GetRangeLimits requestLimit = limits;
 				setRequestLimits(requestLimit, additionalRows, read_end.offset, requestCount);
--- a/fdbclient/SystemData.cpp
+++ b/fdbclient/SystemData.cpp
@ -33,7 +33,8 @@ const KeyRef afterAllKeys = LiteralStringRef("\xff\xff\x00");
 const KeyRangeRef keyServersKeys( LiteralStringRef("\xff/keyServers/"), LiteralStringRef("\xff/keyServers0") );
 const KeyRef keyServersPrefix = keyServersKeys.begin;
 const KeyRef keyServersEnd = keyServersKeys.end;
-const KeyRef keyServersKeyServersKey = LiteralStringRef("\xff/keyServers/\xff/keyServers/");
+const KeyRangeRef keyServersKeyServersKeys ( LiteralStringRef("\xff/keyServers/\xff/keyServers/"), LiteralStringRef("\xff/keyServers/\xff/keyServers0"));
+const KeyRef keyServersKeyServersKey = keyServersKeyServersKeys.begin;

 const Key keyServersKey( const KeyRef& k ) {
 	return k.withPrefix( keyServersPrefix );
--- a/fdbclient/SystemData.h
+++ b/fdbclient/SystemData.h
@ -34,7 +34,7 @@ extern const KeyRangeRef allKeys; // '' to systemKeys.end
 extern const KeyRef afterAllKeys;

 //    "\xff/keyServers/[[begin]]" := "[[vector<serverID>, vector<serverID>]]"
-extern const KeyRangeRef keyServersKeys;
+extern const KeyRangeRef keyServersKeys, keyServersKeyServersKeys;
 extern const KeyRef keyServersPrefix, keyServersEnd, keyServersKeyServersKey;
 const Key keyServersKey( const KeyRef& k );
 const KeyRef keyServersKey( const KeyRef& k, Arena& arena );
--- a/fdbclient/fdbclient.vcxproj
+++ b/fdbclient/fdbclient.vcxproj
@ -38,6 +38,8 @@
      <EnableCompile Condition="'$(Configuration)|$(Platform)'=='Debug|X64'">false</EnableCompile>
      <EnableCompile Condition="'$(Configuration)|$(Platform)'=='Release|X64'">false</EnableCompile>
    </ActorCompiler>
+    <ClInclude Include="MetricLogger.h" />
+    <ActorCompiler Include="MetricLogger.actor.cpp" />
    <ClInclude Include="FailureMonitorClient.h" />
    <ClInclude Include="FDBOptions.g.h" />
    <ClInclude Include="FDBOptions.h" />
--- a/fdbclient/vexillographer/fdb.options
+++ b/fdbclient/vexillographer/fdb.options
@ -94,7 +94,7 @@ description is not currently required but encouraged.
    <Option name="disable_client_statistics_logging" code="70"
            description="Disables logging of client statistics, such as sampled transaction activity." />
    <Option name="enable_slow_task_profiling" code="71"
-            description="Enables slow task profiling. Requires trace logging to be enabled." />
+            description="Enables debugging feature to perform slow task profiling. Requires trace logging to be enabled. WARNING: this feature is not recommended for use in production." />
    <Option name="supported_client_versions" code="1000"
            paramType="String" paramDescription="[release version],[source version],[protocol version];..."
            description="This option is set automatically to communicate the list of supported clients to the active client."
--- a/fdbmonitor/fdbmonitor.cpp
+++ b/fdbmonitor/fdbmonitor.cpp
@ -253,7 +253,8 @@ public:
 	uint32_t restart_delay_reset_interval;
 	double last_start;
 	bool quiet;
-	bool delete_wd40_env;
+	//bool delete_wd40_env;
+	const char *delete_envvars;
 	bool deconfigured;
 	bool kill_on_configuration_change;

@ -261,7 +262,7 @@ public:
 	int pipes[2][2];

 	Command() : argv(NULL) { }
-	Command(const CSimpleIni& ini, std::string _section, uint64_t id, fdb_fd_set fds, int* maxfd) : section(_section), argv(NULL), quiet(false), delete_wd40_env(false), fds(fds), deconfigured(false), kill_on_configuration_change(true) {
+	Command(const CSimpleIni& ini, std::string _section, uint64_t id, fdb_fd_set fds, int* maxfd) : section(_section), argv(NULL), quiet(false), delete_envvars(NULL), fds(fds), deconfigured(false), kill_on_configuration_change(true) {
 		char _ssection[strlen(section.c_str()) + 22];
 		snprintf(_ssection, strlen(section.c_str()) + 22, "%s.%llu", section.c_str(), id);
 		ssection = _ssection;
@ -351,10 +352,8 @@ public:
 		if (q && !strcmp(q, "true"))
 			quiet = true;

-		const char* dwe = get_value_multi(ini, "delete_wd40_env", ssection.c_str(), section.c_str(), "general", NULL);
-		if(dwe && !strcmp(dwe, "true")) {
-			delete_wd40_env = true;
-		}
+		const char* del_env = get_value_multi(ini, "delete_envvars", ssection.c_str(), section.c_str(), "general", NULL);
+		delete_envvars = del_env;

 		const char* kocc = get_value_multi(ini, "kill_on_configuration_change", ssection.c_str(), section.c_str(), "general", NULL);
 		if(kocc && strcmp(kocc, "true")) {
@ -373,7 +372,7 @@ public:

 		for (auto i : keys) {
 			if (!strcmp(i.pItem, "command") || !strcmp(i.pItem, "restart_delay") || !strcmp(i.pItem, "initial_restart_delay") || !strcmp(i.pItem, "restart_backoff") ||
-				!strcmp(i.pItem, "restart_delay_reset_interval") || !strcmp(i.pItem, "disable_lifecycle_logging") || !strcmp(i.pItem, "delete_wd40_env") ||
+				!strcmp(i.pItem, "restart_delay_reset_interval") || !strcmp(i.pItem, "disable_lifecycle_logging") || !strcmp(i.pItem, "delete_envvars") ||
 				!strcmp(i.pItem, "kill_on_configuration_change"))
 			{
 				continue;
@ -408,7 +407,7 @@ public:
 	}
 	void update(const Command& other) {
 		quiet = other.quiet;
-		delete_wd40_env = other.delete_wd40_env;
+		delete_envvars = other.delete_envvars;
 		initial_restart_delay = other.initial_restart_delay;
 		max_restart_delay = other.max_restart_delay;
 		restart_backoff = other.restart_backoff;
@ -474,12 +473,21 @@ void start_process(Command* cmd, uint64_t id, uid_t uid, gid_t gid, int delay, s
 		signal(SIGINT, SIG_DFL);
 		signal(SIGTERM, SIG_DFL);

-		if(cmd->delete_wd40_env) {
-			/* remove WD40 environment variables */
-			if(unsetenv("WD40_BV") || unsetenv("WD40_IS_MY_DADDY") || unsetenv("CONF_BUILD_VERSION")) {
-				log_err("unsetenv", errno, "Failed to remove parent environment variables");
-				exit(1);
-			}
+		if(cmd->delete_envvars != NULL && std::strlen(cmd->delete_envvars) > 0) {
+			// delete_envvars is a space-separated list; skip leading, trailing and repeated spaces so no empty name reaches unsetenv().
+			std::string vars(cmd->delete_envvars);
+			size_t start = vars.find_first_not_of(' ');
+			while(start != std::string::npos) {
+				size_t bound = vars.find(' ', start);
+				std::string var = vars.substr(start, bound - start);
+				log_msg(LOG_INFO, "Deleting parent environment variable: \"%s\"\n", var.c_str());
+				if(unsetenv(var.c_str())) {
+					log_err("unsetenv", errno, "Failed to remove parent environment variable: %s\n", var.c_str());
+					exit(1);
+				}
+				// When bound is npos this returns npos and ends the loop, so the string is never indexed out of range.
+				start = vars.find_first_not_of(' ', bound);
+			}
 		}

 		dup2( cmd->pipes[0][1], fileno(stdout) );
--- a/fdbrpc/AsyncFileNonDurable.actor.h
+++ b/fdbrpc/AsyncFileNonDurable.actor.h
@ -213,8 +213,9 @@ public:
 			//If we are in the process of deleting a file, we can't let someone else modify it at the same time.  We therefore block the creation of new files until deletion is complete
 			state std::map<std::string, Future<Void>>::iterator deletedFile = filesBeingDeleted.find(filename);
 			if(deletedFile != filesBeingDeleted.end()) {
-				//TraceEvent("AsyncFileNonDurableOpenWaitOnDelete").detail("Filename", filename);
+				//TraceEvent("AsyncFileNonDurableOpenWaitOnDelete1").detail("Filename", filename);
 				Void _ = wait( deletedFile->second || shutdown );
+				//TraceEvent("AsyncFileNonDurableOpenWaitOnDelete2").detail("Filename", filename);
 				if(shutdown.isReady())
 					throw io_error().asInjectedFault();
 			}
@ -711,35 +712,44 @@ private:

 	//Finishes all outstanding actors on an AsyncFileNonDurable and then deletes it
 	ACTOR Future<Void> deleteFile(AsyncFileNonDurable *self) {
-		//We must run on the main thread (instead of a SQLite coroutine).  We don't want to signal any promises from a coroutine, so we switch at the beginning
-		//of this ACTOR
-		Void _ = wait(self->returnToMainThread());
+		state ISimulator::ProcessInfo* currentProcess = g_simulator.getCurrentProcess();
+		state int currentTaskID = g_network->getCurrentTask();
+		state std::string filename = self->filename;

-		//Make sure all writes have gone through.
-		Promise<bool> startSyncPromise = self->startSyncPromise;
-		self->startSyncPromise = Promise<bool>();
-		startSyncPromise.send(true);
+		Void _ = wait( g_simulator.onMachine( currentProcess ) );
+		try {
+			//Make sure all writes have gone through.
+			Promise<bool> startSyncPromise = self->startSyncPromise;
+			self->startSyncPromise = Promise<bool>();
+			startSyncPromise.send(true);

-		std::vector<Future<Void>> outstandingModifications;
+			std::vector<Future<Void>> outstandingModifications;

-		for(auto itr = self->pendingModifications.ranges().begin(); itr != self->pendingModifications.ranges().end(); ++itr)
-			if(itr->value().isValid() && !itr->value().isReady())
-				outstandingModifications.push_back(itr->value());
+			for(auto itr = self->pendingModifications.ranges().begin(); itr != self->pendingModifications.ranges().end(); ++itr)
+				if(itr->value().isValid() && !itr->value().isReady())
+					outstandingModifications.push_back(itr->value());

-		//Ignore errors here so that all modifications can finish
-		Void _ = wait(waitForAllReady(outstandingModifications));
+			//Ignore errors here so that all modifications can finish
+			Void _ = wait(waitForAllReady(outstandingModifications));

-		//Make sure we aren't in the process of killing the file
-		if(self->killed.isSet())
-			Void _ = wait(self->killComplete.getFuture());
+			//Make sure we aren't in the process of killing the file
+			if(self->killed.isSet())
+				Void _ = wait(self->killComplete.getFuture());

-		//Remove this file from the filesBeingDeleted map so that new files can be created with this filename
-		g_simulator.getMachineByNetworkAddress( self->openedAddress )->closingFiles.erase(self->getFilename());
-		AsyncFileNonDurable::filesBeingDeleted.erase(self->filename);
-		//TraceEvent("AsyncFileNonDurable_FinishDelete", self->id).detail("Filename", self->filename);
+			//Remove this file from the filesBeingDeleted map so that new files can be created with this filename
+			g_simulator.getMachineByNetworkAddress( self->openedAddress )->closingFiles.erase(self->getFilename());
+			g_simulator.getMachineByNetworkAddress( self->openedAddress )->deletingFiles.erase(self->getFilename());
+			AsyncFileNonDurable::filesBeingDeleted.erase(self->filename);
+			//TraceEvent("AsyncFileNonDurable_FinishDelete", self->id).detail("Filename", self->filename);

-		delete self;
-		return Void();
+			delete self;
+			Void _ = wait( g_simulator.onProcess( currentProcess, currentTaskID ) );
+			return Void();
+		} catch( Error &e ) {
+			state Error err = e;
+			Void _ = wait( g_simulator.onProcess( currentProcess, currentTaskID ) );
+			throw err;
+		}
 	}
 };

--- a/fdbrpc/FlowTests.actor.cpp
+++ b/fdbrpc/FlowTests.actor.cpp
@ -344,10 +344,10 @@ TEST_CASE("flow/flow/quorum")
 	vector<Future<int>> fs;
 	vector<Future<Void>> qs;
 	for (auto& p : ps) fs.push_back(p.getFuture());
-			
+
 	for (int i = 0; i <= ps.size(); i++)
 		qs.push_back( quorum(fs, i) );
-			
+
 	for (int i = 0; i < ps.size(); i++) {
 		ASSERT(qs[i].isReady());
 		ASSERT(!qs[i + 1].isReady());
@ -357,7 +357,7 @@ TEST_CASE("flow/flow/quorum")
 	return Void();
 }

-TEST_CASE("flow/flow/trivial futures") 
+TEST_CASE("flow/flow/trivial futures")
 {
 	Future<int> invalid;
 	ASSERT(!invalid.isValid());
@ -499,7 +499,7 @@ TEST_CASE("flow/flow/promisestream callbacks")
 	onReady(p.getFuture(), [&result](int x) { result = x; }, [&result](Error e){ result = -1; });

 	ASSERT(result == 0);
-			
+
 	p = PromiseStream<int>();

 	ASSERT(result == -1);
@ -989,7 +989,7 @@ TEST_CASE("flow/flow/perf/actor patterns")
 			ASSERT(out2[i].isReady());
 		}
 		printf("2xcheeseActor(chooseTwoActor(cheeseActor(fifo), never)): %0.2f M/sec\n", N / 1e6 / (timer() - start));
-		printf("sizeof(CheeseWaitActorActor) == %d\n", sizeof(CheeseWaitActorActor));
+		printf("sizeof(CheeseWaitActorActor) == %lu\n", sizeof(CheeseWaitActorActor));
 	}

 	{
@ -1140,11 +1140,11 @@ TEST_CASE("flow/flow/YieldedAsyncMap/cancel2")
 	state Future<Void> y2 = yam.onChange(2);

 	auto* pyam = &yam;
-	uncancellable(trigger( 
+	uncancellable(trigger(
 		[pyam](){
 			printf("Triggered\n");
-			pyam->triggerAll(); 
-		}, 
+			pyam->triggerAll();
+		},
 		delay(1)));

 	Void _ = wait(y1);
@ -1191,4 +1191,4 @@ TEST_CASE("fdbrpc/flow/wait_expression_after_cancel")
 	f.cancel();
 	ASSERT( a == 1 );
 	return Void();
-}
+}
--- a/fdbrpc/FlowTransport.actor.cpp
+++ b/fdbrpc/FlowTransport.actor.cpp
@ -492,7 +492,8 @@ static void scanPackets( TransportData* transport, uint8_t*& unprocessed_begin,

 		if (checksumEnabled) {
 			bool isBuggifyEnabled = false;
-			if(g_network->isSimulated() && g_simulator.enableConnectionFailures && BUGGIFY_WITH_PROB(0.001)) {
+			if(g_network->isSimulated() && g_network->now() - g_simulator.lastConnectionFailure > g_simulator.connectionFailuresDisableDuration && BUGGIFY_WITH_PROB(0.001)) {
+				g_simulator.lastConnectionFailure = g_network->now();
 				isBuggifyEnabled = true;
 				TraceEvent(SevInfo, "BitsFlip");
 				int flipBits = 32 - (int) floor(log2(g_random->randomUInt32()));
--- a/fdbrpc/IAsyncFile.actor.cpp
+++ b/fdbrpc/IAsyncFile.actor.cpp
@ -27,25 +27,36 @@

 IAsyncFile::IAsyncFile(){};

-ACTOR static Future<Void> incrementalDeleteHelper( std::string filename, int64_t truncateAmt, double interval ){
+ACTOR static Future<Void> incrementalDeleteHelper( std::string filename, bool mustBeDurable, int64_t truncateAmt, double interval ) {
+	state Reference<IAsyncFile> file;
+	state int64_t remainingFileSize;
+	state bool exists = fileExists(filename);

-	state Reference<IAsyncFile> f = wait(
-		IAsyncFileSystem::filesystem()->open(filename, IAsyncFile::OPEN_READWRITE, 0));
-	state int64_t filesize = wait(f->size());
-	state int64_t i = filesize;
+	if(exists) {
+		Reference<IAsyncFile> f = wait(IAsyncFileSystem::filesystem()->open(filename, IAsyncFile::OPEN_READWRITE | IAsyncFile::OPEN_UNCACHED, 0));
+		file = f;

-	Void _ = wait(IAsyncFileSystem::filesystem()->deleteFile(filename, true));
-	for( ;i > 0; i -= truncateAmt ){
-		Void _ = wait(f->truncate(i));
-		Void _ = wait(f->sync());
-		Void _ = wait(delay(interval));
+		int64_t fileSize = wait(file->size());
+		remainingFileSize = fileSize;
 	}
+
+	Void _ = wait(IAsyncFileSystem::filesystem()->deleteFile(filename, mustBeDurable));
+
+	if(exists) {
+		for( ; remainingFileSize > 0; remainingFileSize -= truncateAmt ){
+			Void _ = wait(file->truncate(remainingFileSize));
+			Void _ = wait(file->sync());
+			Void _ = wait(delay(interval));
+		}
+	}
+
 	return Void();
 }

-Future<Void> IAsyncFile::incrementalDelete( std::string filename){
+Future<Void> IAsyncFile::incrementalDelete( std::string filename, bool mustBeDurable ) {
 	return uncancellable(incrementalDeleteHelper(
 		filename,
+		mustBeDurable,
 		FLOW_KNOBS->INCREMENTAL_DELETE_TRUNCATE_AMOUNT,
 		FLOW_KNOBS->INCREMENTAL_DELETE_INTERVAL));
 }
@ -63,6 +74,6 @@ TEST_CASE( "fileio/incrementalDelete" ) {
 	Void _ = wait(f->truncate(fileSize));
 	//close the file by deleting the reference
 	f.clear();
-	Void _ = wait(IAsyncFile::incrementalDelete(filename));
+	Void _ = wait(IAsyncFile::incrementalDelete(filename, true));
 	return Void();
 }
--- a/fdbrpc/IAsyncFile.h
+++ b/fdbrpc/IAsyncFile.h
@ -57,8 +57,10 @@ public:
 	virtual Future<Void> flush() { return Void();  }      // Sends previous writes to the OS if they have been buffered in memory, but does not make them power safe
 	virtual Future<int64_t> size() = 0;
 	virtual std::string getFilename() = 0;
-	//start an actor to truncate the file repeatedly so that the operating system doesn't delete it all at once
-	static Future<Void> incrementalDelete( std::string filename);
+
+	// Unlinks a file and then deletes it slowly by truncating the file repeatedly.
+	// If mustBeDurable, returns only when the file is guaranteed to be deleted even after a power failure.
+	static Future<Void> incrementalDelete( std::string filename, bool mustBeDurable );

 	// Attempt to read the *length bytes at offset without copying.  If successful, a pointer to the
 	//   requested bytes is written to *data, and the number of bytes successfully read is
--- a/fdbrpc/LoadBalance.actor.h
+++ b/fdbrpc/LoadBalance.actor.h
@ -204,6 +204,22 @@ Future< REPLY_TYPE(Request) > loadBalance(
 				}
 			}
 		}
+		if( nextMetric > 1e8 ) {
+			for(int i=alternatives->countBest(); i<alternatives->size(); i++) {
+				RequestStream<Request> const* thisStream = &alternatives->get( i, channel );
+				if (!IFailureMonitor::failureMonitor().getState( thisStream->getEndpoint() ).failed) {
+					auto& qd = model->getMeasurement(thisStream->getEndpoint().token.first());
+					double thisMetric = qd.smoothOutstanding.smoothTotal();
+					double thisTime = qd.latency;
+				
+					if( thisMetric < nextMetric ) {
+						nextAlt = i;
+						nextMetric = thisMetric;
+						nextTime = thisTime;
+					}
+				}
+			}
+		}

 		if(nextTime < 1e9) {
 			if(bestTime > FLOW_KNOBS->INSTANT_SECOND_REQUEST_MULTIPLIER*(model->secondMultiplier*(nextTime) + FLOW_KNOBS->BASE_SECOND_REQUEST_TIME)) {
--- a/fdbrpc/Locality.cpp
+++ b/fdbrpc/Locality.cpp
@ -27,7 +27,7 @@ const StringRef LocalityData::keyDcId = LiteralStringRef("dcid");
 const StringRef LocalityData::keyMachineId = LiteralStringRef("machineid");
 const StringRef LocalityData::keyDataHallId = LiteralStringRef("data_hall");

-ProcessClass::Fitness ProcessClass::machineClassFitness( ClusterRole role ) {
+ProcessClass::Fitness ProcessClass::machineClassFitness( ClusterRole role ) const {
 	switch( role ) {
 	case ProcessClass::Storage:
 		switch( _class ) {
@ -108,6 +108,23 @@ ProcessClass::Fitness ProcessClass::machineClassFitness( ClusterRole role ) {
 		default:
 			return ProcessClass::WorstFit;
 		}
+	case ProcessClass::ClusterController:
+		switch( _class ) {
+		case ProcessClass::StatelessClass:
+			return ProcessClass::BestFit;
+		case ProcessClass::MasterClass:
+			return ProcessClass::GoodFit;
+		case ProcessClass::ResolutionClass:
+			return ProcessClass::BestOtherFit;
+		case ProcessClass::ProxyClass:
+			return ProcessClass::BestOtherFit;
+		case ProcessClass::UnsetClass:
+			return ProcessClass::UnsetFit;
+		case ProcessClass::TesterClass:
+			return ProcessClass::NeverAssign;
+		default:
+			return ProcessClass::WorstFit;
+		}
 	default:
 		return ProcessClass::NeverAssign;
 	}
--- a/fdbrpc/Locality.h
+++ b/fdbrpc/Locality.h
@ -28,7 +28,7 @@ struct ProcessClass {
 	// This enum is stored in restartInfo.ini for upgrade tests, so be very careful about changing the existing items!
 	enum ClassType { UnsetClass, StorageClass, TransactionClass, ResolutionClass, TesterClass, ProxyClass, MasterClass, StatelessClass, LogClass, InvalidClass = -1 };
 	enum Fitness { BestFit, GoodFit, BestOtherFit, UnsetFit, WorstFit, NeverAssign };
-	enum ClusterRole { Storage, TLog, Proxy, Master, Resolver };
+	enum ClusterRole { Storage, TLog, Proxy, Master, Resolver, ClusterController };
 	enum ClassSource { CommandLineSource, AutoSource, DBSource, InvalidSource = -1 };
 	int16_t _class;
 	int16_t _source;
@ -100,7 +100,7 @@ public:
 		}
 	}

-	Fitness machineClassFitness( ClusterRole role );
+	Fitness machineClassFitness( ClusterRole role ) const ;

 	template <class Ar>
 	void serialize(Ar& ar) {
--- a/fdbrpc/Platform.cpp
+++ b/fdbrpc/Platform.cpp
@ -89,15 +89,11 @@ void eraseDirectoryRecursive( std::string const& dir ) {
 	INJECT_FAULT( platform_error, "eraseDirectoryRecursive" );
 #ifdef _WIN32
 	system( ("rd /s /q \"" + dir + "\"").c_str() );
-#elif defined(__linux__)
+#elif defined(__linux__) || defined(__APPLE__)
 	int error =
 		nftw(dir.c_str(),
-			 [](const char *fpath, const struct stat *sb, int typeflag,
-				struct FTW *ftwbuf) -> int {
-				 if (remove(fpath))
-					 return FTW_STOP;
-				 return FTW_CONTINUE;
-			 }, 64, FTW_DEPTH | FTW_PHYS | FTW_ACTIONRETVAL);
+			[](const char *fpath, const struct stat *sb, int typeflag, struct FTW *ftwbuf) -> int { return remove(fpath); }
+			, 64, FTW_DEPTH | FTW_PHYS);
 	/* Looks like calling code expects this to continue silently if
 	   the directory we're deleting doesn't exist in the first
 	   place */
@ -105,14 +101,6 @@ void eraseDirectoryRecursive( std::string const& dir ) {
 		TraceEvent(SevError, "nftw").detail("Directory", dir).GetLastError();
 		throw platform_error();
 	}
-#elif defined(__APPLE__)
-	// const char* argv[2];
-	// argv[0] = dir.c_str();
-	// argv[1] = NULL;
-	// FTS* fts = fts_open(argv, FTS_PHYSICAL | FTS_SEEDOT | FTS_NOSTAT, NULL);
-	// while (FTSENT* ent = fts_read(fts)) {
-	// 	if (ent->fts_info 
-	// }
 #else
 #error Port me!
 #endif
--- a/fdbrpc/ReplicationPolicy.cpp
+++ b/fdbrpc/ReplicationPolicy.cpp
@ -60,7 +60,7 @@ bool IReplicationPolicy::validateFull(
 		}
 		else if (validate(fromServers->getGroupEntries(), fromServers)) {
 			if (g_replicationdebug > 2) {
-				printf("Error: Validated unsolved policy with all%5lu servers\n", fromServers->size());
+				printf("Error: Validated unsolved policy with all%5d servers\n", fromServers->size());
 			}
 			valid = false;
 		}
@ -194,7 +194,7 @@ bool PolicyAcross::validate(
 				count ++;
 			}
 			else if (g_replicationdebug > 4) {
-				printf("Across invalid solution:%5lu key: %-7s value: (%3lu) %-10s policy: %-10s => %s\n", itValid.second.size(), _attribKey.c_str(), itValid.first._id, fromServers->valueText(itValid.first).c_str(), _policy->name().c_str(), _policy->info().c_str());
+				printf("Across invalid solution:%5lu key: %-7s value: (%3d) %-10s policy: %-10s => %s\n", itValid.second.size(), _attribKey.c_str(), itValid.first._id, fromServers->valueText(itValid.first).c_str(), _policy->name().c_str(), _policy->info().c_str());
 				if (g_replicationdebug > 5) {
 					for (auto& entry : itValid.second) {
 						printf("   entry: %s\n", fromServers->getEntryInfo(entry).c_str());
--- a/fdbrpc/dsltest.actor.cpp
+++ b/fdbrpc/dsltest.actor.cpp
@ -79,7 +79,7 @@ bool testFuzzActor( Future<int>(*actor)(FutureStream<int> const&, PromiseStream<
 			}
 		}
 		if (outCount+1 != expectedOutput.size()) {
-			printf("\tERROR: %s output length incorrect: %d != expected %d\n", desc, outCount+1, expectedOutput.size());
+			printf("\tERROR: %s output length incorrect: %d != expected %lu\n", desc, outCount+1, expectedOutput.size());
 			if (trial) printf("\t\tResult was inconsistent between runs!\n");
 			ok = false;
 			//return false;
--- a/fdbrpc/genericactors.actor.cpp
+++ b/fdbrpc/genericactors.actor.cpp
@ -31,7 +31,7 @@ ACTOR Future<Void> disableConnectionFailuresAfter( double time, std::string cont
 	Void _ = wait( delay(time) );

 	if(g_network->isSimulated()) {
-		g_simulator.enableConnectionFailures = false;
+		g_simulator.connectionFailuresDisableDuration = 1e6;
 		g_simulator.speedUpSimulation = true;
 		TraceEvent(SevWarnAlways, ("DisableConnectionFailures_" + context).c_str());
 	}
--- a/fdbrpc/sim2.actor.cpp
+++ b/fdbrpc/sim2.actor.cpp
@ -65,6 +65,29 @@ bool simulator_should_inject_fault( const char* context, const char* file, int l
 	return false;
 }

+void ISimulator::displayWorkers() const
+{
+	std::map<std::string, std::vector<ISimulator::ProcessInfo*>>	zoneMap;
+
+	// Print every process returned by getAllProcesses() to stdout, grouped under a "<data hall>  <zone id>" key ("[unset]" when a locality field is absent)
+	for (auto processInfo : getAllProcesses()) {
+		std::string dataHall = processInfo->locality.dataHallId().present() ? processInfo->locality.dataHallId().get().printable() : "[unset]";
+		std::string zoneId = processInfo->locality.zoneId().present() ? processInfo->locality.zoneId().get().printable() : "[unset]";
+		zoneMap[format("%-8s  %s", dataHall.c_str(), zoneId.c_str())].push_back(processInfo);
+	}
+
+	printf("DataHall  ZoneId\n"); // heading for the group lines; the next printf is the heading for the per-process lines
+	printf("                  Address   Name      Class        Excluded Failed Rebooting Role                                              DataFolder\n");
+	for (auto& zoneRecord : zoneMap) { // std::map iteration, so groups come out in sorted key order
+		printf("\n%s\n", zoneRecord.first.c_str()); // group line: the map key built above
+		for (auto& processInfo : zoneRecord.second) {
+			printf("                  %9s %-10s%-13s%-8s %-6s %-9s %-48s %-40s\n",
+			processInfo->address.toString().c_str(), processInfo->name, processInfo->startingClass.toString().c_str(), (processInfo->excluded ? "True" : "False"), (processInfo->failed ? "True" : "False"), (processInfo->rebooting ? "True" : "False"), getRoles(processInfo->address).c_str(), processInfo->dataFolder);
+		}
+	}
+
+	return;
+}

 namespace std {
 template<>
@ -342,7 +365,8 @@ private:
 	}

 	void rollRandomClose() {
-		if (g_simulator.enableConnectionFailures && g_random->random01() < .00001) {
+		if (now() - g_simulator.lastConnectionFailure > g_simulator.connectionFailuresDisableDuration && g_random->random01() < .00001) {
+			g_simulator.lastConnectionFailure = now();
 			double a = g_random->random01(), b = g_random->random01();
 			TEST(true);  // Simulated connection failure
 			TraceEvent("ConnectionFailure", dbgid).detail("MyAddr", process->address).detail("PeerAddr", peerProcess->address).detail("SendClosed", a > .33).detail("RecvClosed", a < .66).detail("Explicit", b < .3);
@ -408,7 +432,7 @@ public:
 		if(openCount == 2000) {
 			TraceEvent(SevWarnAlways, "DisableConnectionFailures_TooManyFiles");
 			g_simulator.speedUpSimulation = true;
-			g_simulator.enableConnectionFailures = false;
+			g_simulator.connectionFailuresDisableDuration = 1e6;
 		}

 		Void _ = wait( g_simulator.onMachine( currentProcess ) );
@ -863,16 +887,29 @@ public:
 		// This is a _rudimentary_ simulation of the untrustworthiness of non-durable deletes and the possibility of
 		// rebooting during a durable one.  It isn't perfect: for example, on real filesystems testing
 		// for the existence of a non-durably deleted file BEFORE a reboot will show that it apparently doesn't exist.
-		g_simulator.getCurrentProcess()->machine->openFiles.erase(filename);
+		if(g_simulator.getCurrentProcess()->machine->openFiles.count(filename)) {
+			g_simulator.getCurrentProcess()->machine->openFiles.erase(filename);
+			g_simulator.getCurrentProcess()->machine->deletingFiles.insert(filename);
+		}
 		if ( mustBeDurable || g_random->random01() < 0.5 ) {
-			Void _ = wait( ::delay(0.05 * g_random->random01()) );
-			if (!self->getCurrentProcess()->rebooting) {
-				auto f = IAsyncFileSystem::filesystem(self->net2)->deleteFile(filename, false);
-				ASSERT( f.isReady() );
+			state ISimulator::ProcessInfo* currentProcess = g_simulator.getCurrentProcess();
+			state int currentTaskID = g_network->getCurrentTask();
+			Void _ = wait( g_simulator.onMachine( currentProcess ) );
+			try {
 				Void _ = wait( ::delay(0.05 * g_random->random01()) );
-				TEST( true );  // Simulated durable delete
+				if (!currentProcess->rebooting) {
+					auto f = IAsyncFileSystem::filesystem(self->net2)->deleteFile(filename, false);
+					ASSERT( f.isReady() );
+					Void _ = wait( ::delay(0.05 * g_random->random01()) );
+					TEST( true );  // Simulated durable delete
+				}
+				Void _ = wait( g_simulator.onProcess( currentProcess, currentTaskID ) );
+				return Void();
+			} catch( Error &e ) {
+				state Error err = e;
+				Void _ = wait( g_simulator.onProcess( currentProcess, currentTaskID ) );
+				throw err;
 			}
-			return Void();
 		} else {
 			TEST( true );  // Simulated non-durable delete
 			return Void();
@ -965,8 +1002,11 @@ public:
 		for (auto processInfo : getAllProcesses()) {
 			// Add non-test processes (ie. datahall is not be set for test processes)
 			if (processInfo->isAvailableClass()) {
+				// Excluded machines are counted as dead, so they are left out of the remaining processes
+				if (processInfo->excluded)
+					processesDead.push_back(processInfo);
 				// Mark all of the unavailable as dead
-				if (!processInfo->isAvailable())
+				else if (!processInfo->isAvailable())
 					processesDead.push_back(processInfo);
 				else if (protectedAddresses.count(processInfo->address))
 					processesLeft.push_back(processInfo);
@ -1020,22 +1060,22 @@ public:
 			}
 			// Reboot and Delete if remaining machines do NOT fulfill policies
 			else if ((kt != RebootAndDelete) && (kt != RebootProcessAndDelete) && (!processesLeft.validate(tLogPolicy))) {
-				auto newKt = (g_random->random01() < 0.33) ? RebootAndDelete : Reboot;
+				newKt = (g_random->random01() < 0.33) ? RebootAndDelete : Reboot;
 				canSurvive = false;
 				TraceEvent("KillChanged").detail("KillType", kt).detail("NewKillType", newKt).detail("tLogPolicy", tLogPolicy->info()).detail("ProcessesLeft", processesLeft.size()).detail("ProcessesDead", processesDead.size()).detail("RemainingZones", ::describeZones(localitiesLeft)).detail("RemainingDataHalls", ::describeDataHalls(localitiesLeft)).detail("Reason", "tLogPolicy does not validates against remaining processes.");
 			}
 			else if ((kt != RebootAndDelete) && (kt != RebootProcessAndDelete) && (!processesLeft.validate(storagePolicy))) {
-				auto newKt = (g_random->random01() < 0.33) ? RebootAndDelete : Reboot;
+				newKt = (g_random->random01() < 0.33) ? RebootAndDelete : Reboot;
 				canSurvive = false;
 				TraceEvent("KillChanged").detail("KillType", kt).detail("NewKillType", newKt).detail("storagePolicy", storagePolicy->info()).detail("ProcessesLeft", processesLeft.size()).detail("ProcessesDead", processesDead.size()).detail("RemainingZones", ::describeZones(localitiesLeft)).detail("RemainingDataHalls", ::describeDataHalls(localitiesLeft)).detail("Reason", "storagePolicy does not validates against remaining processes.");
 			}
 			else if ((kt != RebootAndDelete) && (kt != RebootProcessAndDelete) && (nQuorum > uniqueMachines.size())) {
-				auto newKt = (g_random->random01() < 0.33) ? RebootAndDelete : Reboot;
+				newKt = (g_random->random01() < 0.33) ? RebootAndDelete : Reboot;
 				canSurvive = false;
 				TraceEvent("KillChanged").detail("KillType", kt).detail("NewKillType", newKt).detail("storagePolicy", storagePolicy->info()).detail("ProcessesLeft", processesLeft.size()).detail("ProcessesDead", processesDead.size()).detail("RemainingZones", ::describeZones(localitiesLeft)).detail("RemainingDataHalls", ::describeDataHalls(localitiesLeft)).detail("Quorum", nQuorum).detail("Machines", uniqueMachines.size()).detail("Reason", "Not enough unique machines to perform auto configuration of coordinators.");
 			}
 			else {
-				TraceEvent("CanSurviveKills").detail("KillType", kt).detail("ProcessesLeft", processesLeft.size()).detail("ProcessesDead", processesDead.size()).detail("DeadZones", ::describeZones(localitiesDead)).detail("DeadDataHalls", ::describeDataHalls(localitiesDead)).detail("tLogPolicy", tLogPolicy->info()).detail("storagePolicy", storagePolicy->info()).detail("Quorum", nQuorum).detail("Machines", uniqueMachines.size()).detail("ZonesLeft", ::describeZones(localitiesLeft)).detail("ValidateRemaining", processesLeft.validate(tLogPolicy));
+				TraceEvent("CanSurviveKills").detail("KillType", kt).detail("ProcessesLeft", processesLeft.size()).detail("ProcessesDead", processesDead.size()).detail("DeadZones", ::describeZones(localitiesDead)).detail("DeadDataHalls", ::describeDataHalls(localitiesDead)).detail("tLogPolicy", tLogPolicy->info()).detail("storagePolicy", storagePolicy->info()).detail("Quorum", nQuorum).detail("Machines", uniqueMachines.size()).detail("ZonesLeft", ::describeZones(localitiesLeft)).detail("DataHallsLeft", ::describeDataHalls(localitiesLeft)).detail("ValidateRemaining", processesLeft.validate(tLogPolicy));
 			}
 		}
 		if (newKillType) *newKillType = newKt;
@ -1059,12 +1099,12 @@ public:
 		TEST( kt == InjectFaults ); // Simulated machine was killed with faults

 		if (kt == KillInstantly) {
-			TraceEvent(SevWarn, "FailMachine").detail("Name", machine->name).detail("Address", machine->address).detailext("ZoneId", machine->locality.zoneId()).detail("Process", describe(*machine)).detail("Rebooting", machine->rebooting).backtrace();
+			TraceEvent(SevWarn, "FailMachine", machine->locality.zoneId()).detail("Name", machine->name).detail("Address", machine->address).detailext("ZoneId", machine->locality.zoneId()).detail("Process", describe(*machine)).detail("Rebooting", machine->rebooting).detail("Protected", protectedAddresses.count(machine->address)).backtrace();
 			// This will remove all the "tracked" messages that came from the machine being killed
 			latestEventCache.clear();
 			machine->failed = true;
 		} else if (kt == InjectFaults) {
-			TraceEvent(SevWarn, "FaultMachine").detail("Name", machine->name).detail("Address", machine->address).detailext("ZoneId", machine->locality.zoneId()).detail("Process", describe(*machine)).detail("Rebooting", machine->rebooting).backtrace();
+			TraceEvent(SevWarn, "FaultMachine", machine->locality.zoneId()).detail("Name", machine->name).detail("Address", machine->address).detailext("ZoneId", machine->locality.zoneId()).detail("Process", describe(*machine)).detail("Rebooting", machine->rebooting).detail("Protected", protectedAddresses.count(machine->address)).backtrace();
 			should_inject_fault = simulator_should_inject_fault;
 			machine->fault_injection_r = g_random->randomUniqueID().first();
 			machine->fault_injection_p1 = 0.1;
@ -1075,8 +1115,10 @@ public:
 		ASSERT(!protectedAddresses.count(machine->address) || machine->rebooting);
 	}
 	virtual void rebootProcess( ProcessInfo* process, KillType kt ) {
-		if( kt == RebootProcessAndDelete && protectedAddresses.count(process->address) )
+		// A process whose address is in protectedAddresses is never rebooted with
+		// RebootProcessAndDelete: the request is downgraded to RebootProcess and the
+		// change is traced before the process is handed to doReboot().
+		if( kt == RebootProcessAndDelete && protectedAddresses.count(process->address) ) {
+			TraceEvent("RebootChanged").detail("ZoneId", process->locality.describeZone()).detail("KillType", RebootProcess).detail("OrigKillType", kt).detail("Reason", "Protected process");
 			kt = RebootProcess;
+		}
 		doReboot( process, kt );
 	}
 	virtual void rebootProcess(Optional<Standalone<StringRef>> zoneId, bool allProcesses ) {
@ -1121,6 +1163,7 @@ public:
 		TEST(kt == InjectFaults);  // Trying to kill by injecting faults

 		if(speedUpSimulation && !forceKill) {
+			TraceEvent(SevWarn, "AbortedKill", zoneId).detailext("ZoneId", zoneId).detail("Reason", "Unforced kill within speedy simulation.").backtrace();
 			return false;
 		}

@ -1145,15 +1188,25 @@ public:
 		if ((kt == KillInstantly) || (kt == InjectFaults) || (kt == RebootAndDelete) || (kt == RebootProcessAndDelete))
 		{
 			std::vector<ProcessInfo*>	processesLeft, processesDead;
+			int	protectedWorker = 0, unavailable = 0, excluded = 0;

 			for (auto machineRec : machines) {
 				for (auto processInfo : machineRec.second.processes) {
 					// Add non-test processes (ie. datahall is not be set for test processes)
 					if (processInfo->isAvailableClass()) {
-						if (!processInfo->isAvailable())
+						// Do not include any excluded machines
+						if (processInfo->excluded) {
 							processesDead.push_back(processInfo);
-						else if (protectedAddresses.count(processInfo->address))
+							excluded ++;
+						}
+						else if (!processInfo->isAvailable()) {
+							processesDead.push_back(processInfo);
+							unavailable ++;
+						}
+						else if (protectedAddresses.count(processInfo->address)) {
 							processesLeft.push_back(processInfo);
+							protectedWorker ++;
+						}
 						else if (machineRec.second.zoneId != zoneId)
 							processesLeft.push_back(processInfo);
 						// Add processes from dead machines and datacenter machines to dead group
@ -1166,7 +1219,7 @@ public:
 				if ((kt != Reboot) && (!killIsSafe)) {
 					kt = Reboot;
 				}
-				TraceEvent("ChangedKillMachine", zoneId).detailext("ZoneId", zoneId).detail("KillType", kt).detail("OrigKillType", ktOrig).detail("ProcessesLeft", processesLeft.size()).detail("ProcessesDead", processesDead.size()).detail("TotalProcesses", machines.size()).detail("processesPerMachine", processesPerMachine).detail("tLogPolicy", tLogPolicy->info()).detail("storagePolicy", storagePolicy->info());
+				TraceEvent("ChangedKillMachine", zoneId).detailext("ZoneId", zoneId).detail("KillType", kt).detail("OrigKillType", ktOrig).detail("ProcessesLeft", processesLeft.size()).detail("ProcessesDead", processesDead.size()).detail("TotalProcesses", machines.size()).detail("processesPerMachine", processesPerMachine).detail("Protected", protectedWorker).detail("Unavailable", unavailable).detail("Excluded", excluded).detail("ProtectedTotal", protectedAddresses.size()).detail("tLogPolicy", tLogPolicy->info()).detail("storagePolicy", storagePolicy->info());
 			}
 			else if ((kt == KillInstantly) || (kt == InjectFaults)) {
 				TraceEvent("DeadMachine", zoneId).detailext("ZoneId", zoneId).detail("KillType", kt).detail("ProcessesLeft", processesLeft.size()).detail("ProcessesDead", processesDead.size()).detail("TotalProcesses", machines.size()).detail("processesPerMachine", processesPerMachine).detail("tLogPolicy", tLogPolicy->info()).detail("storagePolicy", storagePolicy->info());
@ -1193,36 +1246,40 @@ public:
 		// Check if any processes on machine are rebooting
 		if( processesOnMachine != processesPerMachine && kt >= RebootAndDelete ) {
 			TEST(true); //Attempted reboot, but the target did not have all of its processes running
-			TraceEvent(SevWarn, "AbortedReboot", zoneId).detailext("ZoneId", zoneId).detail("Reason", "The target did not have all of its processes running.").detail("processes", processesOnMachine).detail("processesPerMachine", processesPerMachine).backtrace();
+			TraceEvent(SevWarn, "AbortedKill", zoneId).detail("KillType", kt).detailext("ZoneId", zoneId).detail("Reason", "Machine processes does not match number of processes per machine").detail("processes", processesOnMachine).detail("processesPerMachine", processesPerMachine).backtrace();
 			return false;
 		}

 		// Check if any processes on machine are rebooting
 		if ( processesOnMachine != processesPerMachine) {
 			TEST(true); //Attempted reboot, but the target did not have all of its processes running
-			TraceEvent(SevWarn, "AbortedKill", zoneId).detailext("ZoneId", zoneId).detail("Reason", "The target did not have all of its processes running.").detail("processes", processesOnMachine).detail("processesPerMachine", processesPerMachine).backtrace();
+			TraceEvent(SevWarn, "AbortedKill", zoneId).detail("KillType", kt).detailext("ZoneId", zoneId).detail("Reason", "Machine processes does not match number of processes per machine").detail("processes", processesOnMachine).detail("processesPerMachine", processesPerMachine).backtrace();
 			return false;
 		}

-
 		TraceEvent("KillMachine", zoneId).detailext("ZoneId", zoneId).detail("Kt", kt).detail("KtOrig", ktOrig).detail("KilledMachines", killedMachines).detail("KillableMachines", processesOnMachine).detail("ProcessPerMachine", processesPerMachine).detail("KillChanged", kt!=ktOrig).detail("killIsSafe", killIsSafe);
 		if (kt < RebootAndDelete ) {
 			if(kt == InjectFaults && machines[zoneId].machineProcess != nullptr)
 				killProcess_internal( machines[zoneId].machineProcess, kt );
 			for (auto& process : machines[zoneId].processes) {
-				TraceEvent("KillMachineProcess", zoneId).detail("KillType", kt).detail("Process", process->toString()).detail("startingClass", process->startingClass.toString());
+				TraceEvent("KillMachineProcess", zoneId).detail("KillType", kt).detail("Process", process->toString()).detail("startingClass", process->startingClass.toString()).detail("failed", process->failed).detail("excluded", process->excluded).detail("rebooting", process->rebooting);
 				if (process->startingClass != ProcessClass::TesterClass)
 					killProcess_internal( process, kt );
 			}
 		}
 		else if ( kt == Reboot || killIsSafe) {
 			for (auto& process : machines[zoneId].processes) {
-				TraceEvent("KillMachineProcess", zoneId).detail("KillType", kt).detail("Process", process->toString()).detail("startingClass", process->startingClass.toString());
+				TraceEvent("KillMachineProcess", zoneId).detail("KillType", kt).detail("Process", process->toString()).detail("startingClass", process->startingClass.toString()).detail("failed", process->failed).detail("excluded", process->excluded).detail("rebooting", process->rebooting);
 				if (process->startingClass != ProcessClass::TesterClass)
 					doReboot(process, kt );
 			}
 		}

+		TEST(kt == RebootAndDelete); // Resulted in a reboot and delete
+		TEST(kt == Reboot); // Resulted in a reboot
+		TEST(kt == KillInstantly); // Resulted in an instant kill
+		TEST(kt == InjectFaults);  // Resulted in a kill by injecting faults
+
 		return true;
 	}

@ -1233,13 +1290,16 @@ public:
 		int	dcProcesses = 0;

 		// Switch to a reboot, if anything protected on machine
-		for (auto& process : processes) {
-			auto processDcId = process->locality.dcId();
-			auto processZoneId = process->locality.zoneId();
+		for (auto& procRecord : processes) {
+			auto processDcId = procRecord->locality.dcId();
+			auto processZoneId = procRecord->locality.zoneId();
 			ASSERT(processZoneId.present());
 			if (processDcId.present() && (processDcId == dcId)) {
-				if (protectedAddresses.count(process->address))
+				if ((kt != Reboot) && (protectedAddresses.count(procRecord->address))) {
 					kt = Reboot;
+					TraceEvent(SevWarn, "DcKillChanged").detailext("DataCenter", dcId).detail("KillType", kt).detail("OrigKillType", ktOrig)
+						.detail("Reason", "Datacenter has protected process").detail("ProcessAddress", procRecord->address).detail("failed", procRecord->failed).detail("rebooting", procRecord->rebooting).detail("excluded", procRecord->excluded).detail("Process", describe(*procRecord));
+				}
 				datacenterZones[processZoneId.get()] ++;
 				dcProcesses ++;
 			}
@ -1254,7 +1314,9 @@ public:
 					// Add non-test processes (ie. datahall is not be set for test processes)
 					if (processInfo->isAvailableClass()) {
 						// Mark all of the unavailable as dead
-						if (!processInfo->isAvailable())
+						if (processInfo->excluded)
+							processesDead.push_back(processInfo);
+						else if (!processInfo->isAvailable())
 							processesDead.push_back(processInfo);
 						else if (protectedAddresses.count(processInfo->address))
 							processesLeft.push_back(processInfo);
@ -1268,10 +1330,18 @@ public:
 			}

 			if (!canKillProcesses(processesLeft, processesDead, kt, &kt)) {
-				TraceEvent(SevWarn, "DcKillChanged").detailext("DataCenter", dcId).detail("KillType", ktOrig).detail("NewKillType", kt);
+				TraceEvent(SevWarn, "DcKillChanged").detailext("DataCenter", dcId).detail("KillType", kt).detail("OrigKillType", ktOrig);
 			}
 			else {
-				TraceEvent("DeadDataCenter").detailext("DataCenter", dcId).detail("KillType", kt).detail("DcZones", datacenterZones.size()).detail("DcProcesses", dcProcesses).detail("ProcessesDead", processesDead.size()).detail("ProcessesLeft", processesLeft.size()).detail("tLogPolicy", storagePolicy->info()).detail("storagePolicy", storagePolicy->info());
+				TraceEvent("DeadDataCenter").detailext("DataCenter", dcId).detail("KillType", kt).detail("DcZones", datacenterZones.size()).detail("DcProcesses", dcProcesses).detail("ProcessesDead", processesDead.size()).detail("ProcessesLeft", processesLeft.size()).detail("tLogPolicy", tLogPolicy->info()).detail("storagePolicy", storagePolicy->info());
+				for (auto process : processesLeft) {
+					auto zoneId = process->locality.zoneId();
+					TraceEvent("DeadDcSurvivors", zoneId).detailext("ZoneId", zoneId).detail("KillType", kt).detail("ProcessesLeft", processesLeft.size()).detail("ProcessesDead", processesDead.size()).detail("SurvivingProcess", describe(*process));
+				}
+				for (auto process : processesDead) {
+					auto zoneId = process->locality.zoneId();
+					TraceEvent("DeadDcVictims", zoneId).detailext("ZoneId", zoneId).detail("KillType", kt).detail("ProcessesLeft", processesLeft.size()).detail("ProcessesDead", processesDead.size()).detail("VictimProcess", describe(*process));
+				}
 			}
 		}

@ -1283,10 +1353,13 @@ public:
 			.detail("DcZones", datacenterZones.size())
 			.detail("DcProcesses", dcProcesses)
 			.detailext("DCID", dcId)
-			.detail("KillType", kt);
+			.detail("KillType", kt)
+			.detail("OrigKillType", ktOrig);

 		for (auto& datacenterZone : datacenterZones)
-			killMachine( datacenterZone.first, kt, (kt == RebootAndDelete), true);
+		killMachine( datacenterZone.first, kt, (kt == RebootAndDelete), true);
+// ahm  If above doesn't work, go conservative
+//	killMachine( datacenterZone.first, kt, false, true);
 	}
 	virtual void clogInterface( uint32_t ip, double seconds, ClogMode mode = ClogDefault ) {
 		if (mode == ClogDefault) {
@ -1449,7 +1522,7 @@ public:
 void startNewSimulator() {
+	// Creates the Sim2 instance and installs it as both g_network and g_pSimulator;
+	// asserts that no network object exists yet.
 	ASSERT( !g_network );
 	g_network = g_pSimulator = new Sim2();
-	g_simulator.enableConnectionFailures = g_random->random01() < 0.5;
+	// A single random01() draw selects a disable duration of either 0 or 1e6.
+	g_simulator.connectionFailuresDisableDuration = g_random->random01() < 0.5 ? 0 : 1e6;
 }

 static double networkLatency() {
@ -1464,6 +1537,9 @@ static double networkLatency() {
 }

 ACTOR void doReboot( ISimulator::ProcessInfo *p, ISimulator::KillType kt ) {
+	TraceEvent("RebootingProcessAttempt").detailext("ZoneId", p->locality.zoneId()).detail("KillType", kt).detail("Process", p->toString()).detail("startingClass", p->startingClass.toString()).detail("failed", p->failed).detail("excluded", p->excluded).detail("rebooting", p->rebooting).detail("TaskDefaultDelay", TaskDefaultDelay);
+//	ASSERT(p->failed); //ahm
+
 	Void _ = wait( g_sim2.delay( 0, TaskDefaultDelay, p ) ); // Switch to the machine in question

 	try {
@ -1476,7 +1552,7 @@ ACTOR void doReboot( ISimulator::ProcessInfo *p, ISimulator::KillType kt ) {

 		if( p->rebooting )
 			return;
-		TraceEvent("RebootingMachine").detail("KillType", kt).detail("Address", p->address).detailext("ZoneId", p->locality.zoneId()).detailext("DataHall", p->locality.dataHallId()).detail("Locality", p->locality.toString());
+		TraceEvent("RebootingProcess").detail("KillType", kt).detail("Address", p->address).detailext("ZoneId", p->locality.zoneId()).detailext("DataHall", p->locality.dataHallId()).detail("Locality", p->locality.toString()).detail("failed", p->failed).detail("excluded", p->excluded).backtrace();
 		p->rebooting = true;
 		p->shutdownSignal.send( kt );
 	} catch (Error& e) {
@ -1488,7 +1564,7 @@ ACTOR void doReboot( ISimulator::ProcessInfo *p, ISimulator::KillType kt ) {

 //Simulates delays for performing operations on disk
 Future<Void> waitUntilDiskReady( Reference<DiskParameters> diskParameters, int64_t size, bool sync ) {
-	if(!g_simulator.enableConnectionFailures)
+	if(g_simulator.connectionFailuresDisableDuration > 1e4)
 		return delay(0.0001);

 	if( diskParameters->nextOperation < now() ) diskParameters->nextOperation = now();
--- a/fdbrpc/simulator.h
+++ b/fdbrpc/simulator.h
@ -34,7 +34,7 @@ enum ClogMode { ClogDefault, ClogAll, ClogSend, ClogReceive };

 class ISimulator : public INetwork {
 public:
-	ISimulator() : killedMachines(0), killableMachines(0), machinesNeededForProgress(3), neededDatacenters(1), killableDatacenters(0), killedDatacenters(0), maxCoordinatorsInDatacenter(0), desiredCoordinators(1), processesPerMachine(0), isStopped(false), enableConnectionFailures(true), speedUpSimulation(false), allSwapsDisabled(false), backupAgents(WaitForType), extraDB(NULL) {}
+	// Default state: every counter starts at zero except machinesNeededForProgress (3),
+	// neededDatacenters (1) and desiredCoordinators (1); lastConnectionFailure and
+	// connectionFailuresDisableDuration both start at 0.
+	ISimulator() : killedMachines(0), killableMachines(0), machinesNeededForProgress(3), neededDatacenters(1), killableDatacenters(0), killedDatacenters(0), maxCoordinatorsInDatacenter(0), desiredCoordinators(1), processesPerMachine(0), isStopped(false), lastConnectionFailure(0), connectionFailuresDisableDuration(0), speedUpSimulation(false), allSwapsDisabled(false), backupAgents(WaitForType), extraDB(NULL) {}

 	// Order matters!
+	// KillType values are compared with < and >= (for example against RebootAndDelete),
+	// so the position of each enumerator is part of its meaning.
 	enum KillType { None, KillInstantly, InjectFaults, RebootAndDelete, Reboot, RebootProcessAndDelete, RebootProcess };
@ -108,6 +108,7 @@ public:
 		ProcessInfo* machineProcess;
 		std::vector<ProcessInfo*> processes;
 		std::map<std::string, Future<Reference<IAsyncFile>>> openFiles;
+		std::set<std::string> deletingFiles;
 		std::set<std::string> closingFiles;
 		Optional<Standalone<StringRef>>	zoneId;

@ -148,18 +149,83 @@ public:
 	//virtual KillType getMachineKillState( UID zoneID ) = 0;
 	virtual bool canKillProcesses(std::vector<ProcessInfo*> const& availableProcesses, std::vector<ProcessInfo*> const& deadProcesses, KillType kt, KillType* newKillType) const = 0;
 	virtual bool isAvailable() const = 0;
+	virtual void displayWorkers() const;
+
+	// Records one more instance of `role` at `address` (creating the address and role
+	// entries on first use) and traces the resulting counts.
+	virtual void addRole(NetworkAddress const& address, std::string const& role) {
+		auto& rolesAtAddress = roleAddresses[address];
+		int& roleCount = rolesAtAddress[role];
+		roleCount ++;
+		TraceEvent("RoleAdd")
+			.detail("Address", address)
+			.detail("Role", role)
+			.detail("Roles", rolesAtAddress.size())
+			.detail("Value", roleCount);
+	}
+
+	// Drops one instance of `role` from `address` and traces the outcome.
+	// The role count is decremented while it is above one; otherwise the role entry is
+	// erased, and the address entry is erased with it once no roles remain. An unknown
+	// address or role only produces a SevWarn trace.
+	virtual void removeRole(NetworkAddress const& address, std::string const& role) {
+		auto addressIt = roleAddresses.find(address);
+		if (addressIt == roleAddresses.end()) {
+			TraceEvent(SevWarn,"RoleRemove").detail("Address", address).detail("Role", role).detail("Result", "Address Missing");
+			return;
+		}
+
+		auto& rolesAtAddress = addressIt->second;
+		auto roleEntry = rolesAtAddress.find(role);
+		if (roleEntry == rolesAtAddress.end()) {
+			TraceEvent(SevWarn,"RoleRemove").detail("Address", address).detail("Role", role).detail("Result", "Role Missing");
+			return;
+		}
+
+		// More than one instance registered: just count down.
+		if (roleEntry->second > 1) {
+			roleEntry->second --;
+			TraceEvent("RoleRemove").detail("Address", address).detail("Role", role).detail("Roles", rolesAtAddress.size()).detail("Value", roleEntry->second).detail("Result", "Decremented Role");
+			return;
+		}
+
+		// Last instance of this role: remove the role entry.
+		rolesAtAddress.erase(roleEntry);
+		if (rolesAtAddress.size()) {
+			TraceEvent("RoleRemove").detail("Address", address).detail("Role", role).detail("Roles", rolesAtAddress.size()).detail("Value", 0).detail("Result", "Removed Role");
+			return;
+		}
+
+		// No roles left at this address: remove the address entry as well.
+		roleAddresses.erase(addressIt);
+		TraceEvent("RoleRemove").detail("Address", address).detail("Role", role).detail("Roles", 0).detail("Value", 0).detail("Result", "Removed Address");
+	}
+
+	// Builds a text list of the roles recorded for `address`.
+	// Each role name is followed by a space; a role registered more than once is written
+	// as "name-count ". The "Worker" role is left out unless skipWorkers is false.
+	// Returns "[unset]" when nothing was appended.
+	virtual std::string getRoles(NetworkAddress const& address, bool skipWorkers = true) const {
+		std::string roleText;
+		auto addressIt = roleAddresses.find(address);
+		if (addressIt != roleAddresses.end()) {
+			for (auto& roleEntry : addressIt->second) {
+				if (skipWorkers && (roleEntry.first == "Worker"))
+					continue;
+				roleText += roleEntry.first;
+				if (roleEntry.second > 1)
+					roleText += format("-%d ", roleEntry.second);
+				else
+					roleText += " ";
+			}
+		}
+		if (roleText.empty())
+			return "[unset]";
+		return roleText;
+	}

 	virtual void excludeAddress(NetworkAddress const& address) {
-		excludedAddresses.insert(address);
+		// Exclusions are counted per address: each call adds one to the address's
+		// count (creating the entry on first use) and traces the new value.
+		excludedAddresses[address]++;
+		TraceEvent("ExcludeAddress").detail("Address", address).detail("Value", excludedAddresses[address]);
 	}
+
 	virtual void includeAddress(NetworkAddress const& address) {
-		excludedAddresses.erase(address);
+		// Removes one exclusion from the address's count. The entry is only erased
+		// when the count is at one or below; an address with no entry is reported
+		// with a SevWarn trace and otherwise ignored.
+		auto addressIt = excludedAddresses.find(address);
+		if (addressIt != excludedAddresses.end()) {
+			if (addressIt->second > 1) {
+				// Other exclusions of this address are still outstanding.
+				addressIt->second --;
+				TraceEvent("IncludeAddress").detail("Address", address).detail("Value", addressIt->second).detail("Result", "Decremented");
+			}
+			else {
+				// Last outstanding exclusion: drop the entry entirely.
+				excludedAddresses.erase(addressIt);
+				TraceEvent("IncludeAddress").detail("Address", address).detail("Value", 0).detail("Result", "Removed");
+			}
+		}
+		else {
+			TraceEvent(SevWarn,"IncludeAddress").detail("Address", address).detail("Result", "Missing");
+		}
 	}
 	virtual void includeAllAddresses() {
+		// Trace how many addresses were excluded, then drop every entry regardless of its count.
+		TraceEvent("IncludeAddressAll").detail("AddressTotal", excludedAddresses.size());
 		excludedAddresses.clear();
 	}
 	virtual bool isExcluded(NetworkAddress const& address) const {
-		return excludedAddresses.count(address) == 0;
+		// An address is excluded exactly while it has an entry in excludedAddresses.
+		return excludedAddresses.find(address) != excludedAddresses.end();
 	}

 	virtual void disableSwapToMachine(Optional<Standalone<StringRef>> zoneId ) {
@ -215,7 +281,8 @@ public:
 	std::string connectionString;

 	bool isStopped;
-	bool enableConnectionFailures;
+	double lastConnectionFailure;
+	double connectionFailuresDisableDuration;
 	bool speedUpSimulation;
 	BackupAgentType backupAgents;

@ -228,7 +295,8 @@ protected:

 private:
 	std::set<Optional<Standalone<StringRef>>> swapsDisabled;
-	std::set<NetworkAddress> excludedAddresses;
+	std::map<NetworkAddress, int> excludedAddresses;
+	std::map<NetworkAddress, std::map<std::string, int>> roleAddresses;
 	bool allSwapsDisabled;
 };

--- a/fdbserver/ClusterController.actor.cpp
+++ b/fdbserver/ClusterController.actor.cpp
@ -188,13 +188,16 @@ public:
 	}

 	//FIXME: get master in the same datacenter as the proxies and resolvers for ratekeeper, however this is difficult because the master is recruited before we know the cluster's configuration
-	std::pair<WorkerInterface, ProcessClass> getMasterWorker( bool checkStable = false ) {
+	std::pair<WorkerInterface, ProcessClass> getMasterWorker( DatabaseConfiguration const& conf, bool checkStable = false ) {
 		ProcessClass::Fitness bestFit = ProcessClass::NeverAssign;
 		Optional<std::pair<WorkerInterface, ProcessClass>> bestInfo;
 		int numEquivalent = 1;
 		for( auto& it : id_worker ) {
-			if( workerAvailable( it.second, checkStable ) ) {
-				ProcessClass::Fitness fit = it.second.processClass.machineClassFitness( ProcessClass::Master );
+			auto fit = it.second.processClass.machineClassFitness( ProcessClass::Master );
+			if(conf.isExcludedServer(it.second.interf.address())) {
+				fit = std::max(fit, ProcessClass::WorstFit);
+			}
+			if( workerAvailable(it.second, checkStable) && fit != ProcessClass::NeverAssign ) {
 				if( fit < bestFit ) {
 					bestInfo = std::make_pair(it.second.interf, it.second.processClass);
 					bestFit = fit;
@ -211,14 +214,56 @@ public:
 		throw no_more_servers();
 	}

-std::vector<std::pair<WorkerInterface, ProcessClass>> getWorkersForTlogsAcrossDatacenters( DatabaseConfiguration const& conf, std::map< Optional<Standalone<StringRef>>, int>& id_used, bool checkStable = false )
+	// Chooses workers to host the seed storage servers.
+	// Workers that are available, not excluded by `conf`, and not NeverAssign for the
+	// Storage class are grouped by fitness. Groups are added to one locality set in
+	// ascending Fitness order (std::map iteration order), and after each group
+	// conf.storagePolicy is asked to select replicas; the first successful selection
+	// is returned. Throws no_more_servers() when no selection succeeds.
+	std::vector<std::pair<WorkerInterface, ProcessClass>> getWorkersForSeedServers( DatabaseConfiguration const& conf ) {
+		typedef std::pair<WorkerInterface, ProcessClass> Candidate;
+
+		std::map<ProcessClass::Fitness, vector<Candidate>> candidatesByFitness;
+		std::vector<Candidate> chosenWorkers;
+		LocalitySetRef candidateSet = Reference<LocalitySet>(new LocalityMap<Candidate>());
+		LocalityMap<Candidate>* candidateMap = (LocalityMap<Candidate>*) candidateSet.getPtr();
+		bool policySatisfied = false;
+
+		for( auto& workerEntry : id_worker ) {
+			auto& info = workerEntry.second;
+			auto storageFitness = info.processClass.machineClassFitness( ProcessClass::Storage );
+			if( !workerAvailable(info, false) )
+				continue;
+			if( conf.isExcludedServer(info.interf.address()) )
+				continue;
+			if( storageFitness == ProcessClass::NeverAssign )
+				continue;
+			candidatesByFitness[ storageFitness ].push_back(std::make_pair(info.interf, info.processClass));
+		}
+
+		for( auto& fitnessGroup : candidatesByFitness ) {
+			// The set accumulates across groups, so each attempt sees this group plus all earlier ones.
+			for (auto& candidate : fitnessGroup.second ) {
+				candidateMap->add(candidate.first.locality, &candidate);
+			}
+
+			std::vector<LocalityEntry> selection;
+			if( !candidateSet->selectReplicas(conf.storagePolicy, selection) )
+				continue;
+
+			chosenWorkers.reserve(selection.size());
+			for (auto& selected : selection) {
+				chosenWorkers.push_back(*candidateMap->getObject(selected));
+			}
+			policySatisfied = true;
+			break;
+		}
+
+		// Empty the set and release the reference before returning or throwing.
+		candidateSet->clear();
+		candidateSet.clear();
+
+		if (!policySatisfied) {
+			throw no_more_servers();
+		}
+
+		return chosenWorkers;
+	}
+
+	std::vector<std::pair<WorkerInterface, ProcessClass>> getWorkersForTlogsAcrossDatacenters( DatabaseConfiguration const& conf, std::map< Optional<Standalone<StringRef>>, int>& id_used, bool checkStable = false )
 	{
 		std::map<ProcessClass::Fitness, vector<std::pair<WorkerInterface, ProcessClass>>> fitness_workers;
-		std::vector<std::pair<WorkerInterface, ProcessClass>>		results;
-		std::vector<LocalityData>							unavailableLocals;
-		LocalitySetRef																					logServerSet;
-		LocalityMap<std::pair<WorkerInterface, ProcessClass>>*	logServerMap;
-		bool		bCompleted = false;
+		std::vector<std::pair<WorkerInterface, ProcessClass>> results;
+		std::vector<LocalityData> unavailableLocals;
+		LocalitySetRef logServerSet;
+		LocalityMap<std::pair<WorkerInterface, ProcessClass>>* logServerMap;
+		UID functionId = g_nondeterministic_random->randomUniqueID();
+		bool bCompleted = false;

 		logServerSet = Reference<LocalitySet>(new LocalityMap<std::pair<WorkerInterface, ProcessClass>>());
 		logServerMap = (LocalityMap<std::pair<WorkerInterface, ProcessClass>>*) logServerSet.getPtr();
@ -230,7 +275,7 @@ std::vector<std::pair<WorkerInterface, ProcessClass>> getWorkersForTlogsAcrossDa
 			}
 			else {
 				if (it.second.interf.locality.dataHallId().present())
-					TraceEvent(SevWarn,"GWFTADNotAvailable", id)
+					TraceEvent(SevWarn,"GWFTADNotAvailable", functionId)
 						.detail("Fitness", fitness)
 						.detailext("Zone", it.second.interf.locality.zoneId())
 						.detailext("DataHall", it.second.interf.locality.dataHallId())
@ -243,7 +288,8 @@ std::vector<std::pair<WorkerInterface, ProcessClass>> getWorkersForTlogsAcrossDa
 						.detail("Locality", it.second.interf.locality.toString())
 						.detail("tLogReplicationFactor", conf.tLogReplicationFactor)
 						.detail("tLogPolicy", conf.tLogPolicy ? conf.tLogPolicy->info() : "[unset]")
-						.detail("DesiredLogs", conf.getDesiredLogs());
+						.detail("DesiredLogs", conf.getDesiredLogs())
+						.detail("InterfaceId", id);
 				unavailableLocals.push_back(it.second.interf.locality);
 			}
 		}
@ -258,12 +304,13 @@ std::vector<std::pair<WorkerInterface, ProcessClass>> getWorkersForTlogsAcrossDa
 				logServerMap->add(worker.first.locality, &worker);
 			}
 			if (logServerSet->size() < conf.tLogReplicationFactor) {
-				TraceEvent(SevWarn,"GWFTADTooFew", id)
+				TraceEvent(SevWarn,"GWFTADTooFew", functionId)
 					.detail("Fitness", fitness)
 					.detail("Processes", logServerSet->size())
 					.detail("tLogReplicationFactor", conf.tLogReplicationFactor)
 					.detail("tLogPolicy", conf.tLogPolicy ? conf.tLogPolicy->info() : "[unset]")
-					.detail("DesiredLogs", conf.getDesiredLogs());
+					.detail("DesiredLogs", conf.getDesiredLogs())
+					.detail("InterfaceId", id);
 			}
 			else if (logServerSet->size() <= conf.getDesiredLogs()) {
 				ASSERT(conf.tLogPolicy);
@ -275,12 +322,13 @@ std::vector<std::pair<WorkerInterface, ProcessClass>> getWorkersForTlogsAcrossDa
 					break;
 				}
 				else {
-					TraceEvent(SevWarn,"GWFTADNotAcceptable", id)
+					TraceEvent(SevWarn,"GWFTADNotAcceptable", functionId)
 						.detail("Fitness", fitness)
 						.detail("Processes", logServerSet->size())
 						.detail("tLogReplicationFactor", conf.tLogReplicationFactor)
 						.detail("tLogPolicy", conf.tLogPolicy ? conf.tLogPolicy->info() : "[unset]")
-						.detail("DesiredLogs", conf.getDesiredLogs());
+						.detail("DesiredLogs", conf.getDesiredLogs())
+						.detail("InterfaceId", id);
 				}
 			}
 			// Try to select the desired size, if larger
@ -300,7 +348,7 @@ std::vector<std::pair<WorkerInterface, ProcessClass>> getWorkersForTlogsAcrossDa
 						results.push_back(*object);
 						tLocalities.push_back(object->first.locality);
 					}
-					TraceEvent("GWFTADBestResults", id)
+					TraceEvent("GWFTADBestResults", functionId)
 						.detail("Fitness", fitness)
 						.detail("Processes", logServerSet->size())
 						.detail("BestCount", bestSet.size())
@ -308,17 +356,19 @@ std::vector<std::pair<WorkerInterface, ProcessClass>> getWorkersForTlogsAcrossDa
 						.detail("BestDataHalls", ::describeDataHalls(tLocalities))
 						.detail("tLogPolicy", conf.tLogPolicy ? conf.tLogPolicy->info() : "[unset]")
 						.detail("TotalResults", results.size())
-						.detail("DesiredLogs", conf.getDesiredLogs());
+						.detail("DesiredLogs", conf.getDesiredLogs())
+						.detail("InterfaceId", id);
 					bCompleted = true;
 					break;
 				}
 				else {
-					TraceEvent(SevWarn,"GWFTADNoBest", id)
+					TraceEvent(SevWarn,"GWFTADNoBest", functionId)
 						.detail("Fitness", fitness)
 						.detail("Processes", logServerSet->size())
 						.detail("tLogReplicationFactor", conf.tLogReplicationFactor)
 						.detail("tLogPolicy", conf.tLogPolicy ? conf.tLogPolicy->info() : "[unset]")
-						.detail("DesiredLogs", conf.getDesiredLogs());
+						.detail("DesiredLogs", conf.getDesiredLogs())
+						.detail("InterfaceId", id);
 				}
 			}
 		}
@ -331,7 +381,7 @@ std::vector<std::pair<WorkerInterface, ProcessClass>> getWorkersForTlogsAcrossDa
 					tLocalities.push_back(object->first.locality);
 				}

-				TraceEvent(SevWarn, "GetTLogTeamFailed")
+				TraceEvent(SevWarn, "GetTLogTeamFailed", functionId)
 					.detail("Policy", conf.tLogPolicy->info())
 					.detail("Processes", logServerSet->size())
 					.detail("Workers", id_worker.size())
@ -344,7 +394,8 @@ std::vector<std::pair<WorkerInterface, ProcessClass>> getWorkersForTlogsAcrossDa
 					.detail("DesiredLogs", conf.getDesiredLogs())
 					.detail("RatingTests",SERVER_KNOBS->POLICY_RATING_TESTS)
 					.detail("checkStable", checkStable)
-					.detail("PolicyGenerations",SERVER_KNOBS->POLICY_GENERATIONS).backtrace();
+					.detail("PolicyGenerations",SERVER_KNOBS->POLICY_GENERATIONS)
+					.detail("InterfaceId", id).backtrace();

 			// Free the set
 			logServerSet->clear();
@ -356,14 +407,25 @@ std::vector<std::pair<WorkerInterface, ProcessClass>> getWorkersForTlogsAcrossDa
 			id_used[result.first.locality.processId()]++;
 		}

-		TraceEvent("GetTLogTeamDone")
+		TraceEvent("GetTLogTeamDone", functionId)
 			.detail("Completed", bCompleted).detail("Policy", conf.tLogPolicy->info())
 			.detail("Results", results.size()).detail("Processes", logServerSet->size())
 			.detail("Workers", id_worker.size())
 			.detail("Replication", conf.tLogReplicationFactor)
 			.detail("Desired", conf.getDesiredLogs())
 			.detail("RatingTests",SERVER_KNOBS->POLICY_RATING_TESTS)
-			.detail("PolicyGenerations",SERVER_KNOBS->POLICY_GENERATIONS);
+			.detail("PolicyGenerations",SERVER_KNOBS->POLICY_GENERATIONS)
+			.detail("InterfaceId", id);
+
+		for (auto& result : results) {
+			TraceEvent("GetTLogTeamWorker", functionId)
+				.detail("Class", result.second.toString())
+				.detail("Address", result.first.address())
+				.detailext("Zone", result.first.locality.zoneId())
+				.detailext("DataHall", result.first.locality.dataHallId())
+				.detail("isExcludedServer", conf.isExcludedServer(result.first.address()))
+				.detail("isAvailable", IFailureMonitor::failureMonitor().getState(result.first.storage.getEndpoint()).isAvailable());
+		}

 		// Free the set
 		logServerSet->clear();
@ -552,6 +614,12 @@ std::vector<std::pair<WorkerInterface, ProcessClass>> getWorkersForTlogsAcrossDa
 		RecruitFromConfigurationReply result;
 		std::map< Optional<Standalone<StringRef>>, int> id_used;

+		if(req.recruitSeedServers) {
+			auto storageServers = getWorkersForSeedServers(req.configuration);
+			for(int i = 0; i < storageServers.size(); i++)
+				result.storageServers.push_back(storageServers[i].first);
+		}
+
 		id_used[masterProcessId]++;
 		auto tlogs = getWorkersForTlogsAcrossDatacenters( req.configuration, id_used );
 		for(int i = 0; i < tlogs.size(); i++)
@ -620,7 +688,7 @@ std::vector<std::pair<WorkerInterface, ProcessClass>> getWorkersForTlogsAcrossDa
 		id_used[masterProcessId]++;

 		ProcessClass::Fitness oldMasterFit = masterWorker->second.processClass.machineClassFitness( ProcessClass::Master );
-		ProcessClass::Fitness newMasterFit = getMasterWorker(true).second.machineClassFitness( ProcessClass::Master );
+		ProcessClass::Fitness newMasterFit = getMasterWorker(db.config, true).second.machineClassFitness( ProcessClass::Master );

 		if(dbi.recoveryState < RecoveryState::FULLY_RECOVERED) {
 			if(oldMasterFit > newMasterFit) {
@ -749,7 +817,7 @@ ACTOR Future<Void> clusterWatchDatabase( ClusterControllerData* cluster, Cluster
 		try {
 			state double recoveryStart = now();
 			TraceEvent("CCWDB", cluster->id).detail("Recruiting", "Master");
-			state std::pair<WorkerInterface, ProcessClass> masterWorker = cluster->getMasterWorker();
+			state std::pair<WorkerInterface, ProcessClass> masterWorker = cluster->getMasterWorker(db->config);
 			if( masterWorker.second.machineClassFitness( ProcessClass::Master ) > SERVER_KNOBS->EXPECTED_MASTER_FITNESS && now() - cluster->startTime < SERVER_KNOBS->WAIT_FOR_GOOD_RECRUITMENT_DELAY ) {
 				TraceEvent("CCWDB", cluster->id).detail("Fitness", masterWorker.second.machineClassFitness( ProcessClass::Master ));
 				Void _ = wait( delay(SERVER_KNOBS->ATTEMPT_RECRUITMENT_DELAY) );
@ -771,7 +839,6 @@ ACTOR Future<Void> clusterWatchDatabase( ClusterControllerData* cluster, Cluster
 				iMaster = newMaster.get();

 				db->masterRegistrationCount = 0;
-				db->config = DatabaseConfiguration();
 				db->forceMasterFailure = Promise<Void>();

 				auto dbInfo = ServerDBInfo( LiteralStringRef("DB") );
@ -1198,7 +1265,7 @@ void clusterRegisterMaster( ClusterControllerData* self, RegisterMasterRequest c
 	}

 	db->masterRegistrationCount = req.registrationCount;
-	db->config = req.configuration;
+	if(req.configuration.present()) db->config = req.configuration.get();

 	bool isChanged = false;
 	auto dbInfo = self->db.serverInfo->get();
@ -1560,14 +1627,14 @@ ACTOR Future<Void> clusterControllerCore( ClusterControllerFullInterface interf,
 	}
 }

-ACTOR Future<Void> clusterController( ServerCoordinators coordinators, Reference<AsyncVar<Optional<ClusterControllerFullInterface>>> currentCC, bool hasConnected ) {
+ACTOR Future<Void> clusterController( ServerCoordinators coordinators, Reference<AsyncVar<Optional<ClusterControllerFullInterface>>> currentCC, bool hasConnected, Reference<AsyncVar<ProcessClass>> asyncProcessClass ) {
 	loop {
 		state ClusterControllerFullInterface cci;
 		state bool inRole = false;
 		cci.initEndpoints();
 		try {
 			//Register as a possible leader; wait to be elected
-			state Future<Void> leaderFail = tryBecomeLeader( coordinators, cci, currentCC, hasConnected );
+			state Future<Void> leaderFail = tryBecomeLeader( coordinators, cci, currentCC, hasConnected, asyncProcessClass );

 			while (!currentCC->get().present() || currentCC->get().get() != cci) {
 				choose {
@ -1591,12 +1658,12 @@ ACTOR Future<Void> clusterController( ServerCoordinators coordinators, Reference
 	}
 }

-ACTOR Future<Void> clusterController( Reference<ClusterConnectionFile> connFile, Reference<AsyncVar<Optional<ClusterControllerFullInterface>>> currentCC ) {
+ACTOR Future<Void> clusterController( Reference<ClusterConnectionFile> connFile, Reference<AsyncVar<Optional<ClusterControllerFullInterface>>> currentCC, Reference<AsyncVar<ProcessClass>> asyncProcessClass) {
 	state bool hasConnected = false;
 	loop {
 		try {
 			ServerCoordinators coordinators( connFile );
-			Void _ = wait( clusterController( coordinators, currentCC, hasConnected ) );
+			Void _ = wait( clusterController( coordinators, currentCC, hasConnected, asyncProcessClass ) );
 		} catch( Error &e ) {
 			if( e.code() != error_code_coordinators_changed )
 				throw; // Expected to terminate fdbserver
--- a/fdbserver/ClusterRecruitmentInterface.h
+++ b/fdbserver/ClusterRecruitmentInterface.h
@ -64,15 +64,16 @@ struct ClusterControllerFullInterface {

 struct RecruitFromConfigurationRequest {
 	DatabaseConfiguration configuration;
+	bool recruitSeedServers; // serialized with the request, between configuration and reply
 	ReplyPromise< struct RecruitFromConfigurationReply > reply;

-	RecruitFromConfigurationRequest() {}
-	explicit RecruitFromConfigurationRequest(DatabaseConfiguration const& configuration)
-		: configuration(configuration) {}
+	RecruitFromConfigurationRequest() : recruitSeedServers(false) {} // never leave the flag indeterminate before deserialization
+	explicit RecruitFromConfigurationRequest(DatabaseConfiguration const& configuration, bool recruitSeedServers)
+		: configuration(configuration), recruitSeedServers(recruitSeedServers) {}

 	template <class Ar>
 	void serialize( Ar& ar ) {
-		ar & configuration & reply;
+		ar & configuration & recruitSeedServers & reply;
 	}
 };

@ -80,10 +81,11 @@ struct RecruitFromConfigurationReply {
 	vector<WorkerInterface> tLogs;
 	vector<WorkerInterface> proxies;
 	vector<WorkerInterface> resolvers;
+	vector<WorkerInterface> storageServers;

 	template <class Ar>
 	void serialize( Ar& ar ) {
-		ar & tLogs & proxies & resolvers;
+		ar & tLogs & proxies & resolvers & storageServers;
 	}
 };

@ -150,7 +152,7 @@ struct RegisterMasterRequest {
 	vector<ResolverInterface> resolvers;
 	DBRecoveryCount recoveryCount;
 	int64_t registrationCount;
-	DatabaseConfiguration configuration;
+	Optional<DatabaseConfiguration> configuration;
 	vector<UID> priorCommittedLogServers;
 	int recoveryState;
 	
--- a/fdbserver/Coordination.actor.cpp
+++ b/fdbserver/Coordination.actor.cpp
@ -247,8 +247,8 @@ ACTOR Future<Void> leaderRegister(LeaderElectionRegInterface interf, Key key) {
 				return Void();
 			} else {
 				Optional<LeaderInfo> nextNominee = 
-					availableLeaders.size() ? *availableLeaders.rbegin() : 
-					availableCandidates.size() ? *availableCandidates.rbegin() : Optional<LeaderInfo>();
+					availableLeaders.size() ? *availableLeaders.begin() : 
+					availableCandidates.size() ? *availableCandidates.begin() : Optional<LeaderInfo>();

 				if (nextNominee != currentNominee || !availableLeaders.size()) {
 					TraceEvent("NominatingLeader").detail("Nominee", nextNominee.present() ? nextNominee.get().changeID : UID())
--- a/fdbserver/DataDistribution.actor.cpp
+++ b/fdbserver/DataDistribution.actor.cpp
@ -997,16 +997,22 @@ struct DDTeamCollection {
 				state int teamsToBuild = desiredTeams - teamCount;

 				state vector<std::vector<UID>> builtTeams;
-				int addedTeams = wait( self->addAllTeams( self, desiredServerVector, &builtTeams, teamsToBuild ) );

-				if( addedTeams < teamsToBuild ) {
-					for( int i = 0; i < builtTeams.size(); i++ ) {
-						std::sort(builtTeams[i].begin(), builtTeams[i].end());
-						self->addTeam( builtTeams[i].begin(), builtTeams[i].end() );
+				if (self->teamSize <= 3) {
+					int addedTeams = wait( self->addAllTeams( self, desiredServerVector, &builtTeams, teamsToBuild ) );
+
+					if( addedTeams < teamsToBuild ) {
+						for( int i = 0; i < builtTeams.size(); i++ ) {
+							std::sort(builtTeams[i].begin(), builtTeams[i].end());
+							self->addTeam( builtTeams[i].begin(), builtTeams[i].end() );
+						}
+						TraceEvent("AddAllTeams", self->masterId).detail("CurrentTeams", self->teams.size()).detail("AddedTeams", builtTeams.size());
 					}
-					TraceEvent("AddAllTeams", self->masterId).detail("CurrentTeams", self->teams.size()).detail("AddedTeams", builtTeams.size());
-				}
-				else {
+					else {
+						int addedTeams = self->addTeamsBestOf( teamsToBuild );
+						TraceEvent("AddTeamsBestOf", self->masterId).detail("CurrentTeams", self->teams.size()).detail("AddedTeams", addedTeams);
+					}
+				} else {
 					int addedTeams = self->addTeamsBestOf( teamsToBuild );
 					TraceEvent("AddTeamsBestOf", self->masterId).detail("CurrentTeams", self->teams.size()).detail("AddedTeams", addedTeams);
 				}
@ -1878,54 +1884,6 @@ ACTOR Future<bool> isDataDistributionEnabled( Database cx ) {
 	}
 }

-ACTOR Future<int> disableDataDistribution( Database cx ) {
-	state Transaction tr(cx);
-	state int oldMode = -1;
-	state BinaryWriter wr(Unversioned());
-	wr << 0;
-
-	loop {
-		try {
-			Optional<Value> old = wait( tr.get( dataDistributionModeKey ) );
-			if (oldMode < 0) {
-				oldMode = 1;
-				if (old.present()) {
-					BinaryReader rd(old.get(), Unversioned());
-					rd >> oldMode;
-				}
-			}
-			// SOMEDAY: Write a wrapper in MoveKeys.h
-			BinaryWriter wrMyOwner(Unversioned()); wrMyOwner << dataDistributionModeLock;
-			tr.set( moveKeysLockOwnerKey, wrMyOwner.toStringRef() );
-			tr.set( dataDistributionModeKey, wr.toStringRef() );
-
-			Void _ = wait( tr.commit() );
-			return oldMode;
-		} catch (Error& e) {
-			TraceEvent("disableDDModeRetrying").error(e);
-			Void _ = wait ( tr.onError(e) );
-		}
-	}
-}
-
-ACTOR Future<Void> enableDataDistribution( Database cx, int mode ) {
-	state Transaction tr(cx);
-	state BinaryWriter wr(Unversioned());
-	wr << mode;
-
-	loop {
-		try {
-			Optional<Value> old = wait( tr.get( dataDistributionModeKey ) );
-			tr.set( dataDistributionModeKey, wr.toStringRef() );
-			Void _ = wait( tr.commit() );
-			return Void();
-		} catch (Error& e) {
-			TraceEvent("enableDDModeRetrying").error(e);
-			Void _ = wait( tr.onError(e) );
-		}
-	}
-}
-
 //Ensures that the serverKeys key space is properly coalesced
 //This method is only used for testing and is not implemented in a manner that is safe for large databases
 ACTOR Future<Void> debugCheckCoalescing(Database cx) {
--- a/fdbserver/DataDistribution.h
+++ b/fdbserver/DataDistribution.h
@ -210,7 +210,4 @@ struct ShardSizeBounds {
 ShardSizeBounds getShardSizeBounds(KeyRangeRef shard, int64_t maxShardSize);

 //Determines the maximum shard size based on the size of the database
-int64_t getMaxShardSize( double dbSizeEstimate );
-
-Future<Void> enableDataDistribution( Database const& cx, int const& mode );
-Future<int> disableDataDistribution( Database const& cx );
+int64_t getMaxShardSize( double dbSizeEstimate );
--- a/fdbserver/DataDistributionTracker.actor.cpp
+++ b/fdbserver/DataDistributionTracker.actor.cpp
@ -376,15 +376,6 @@ Future<Void> shardMerger(
 	TEST(true);  // shard to be merged
 	ASSERT( keys.begin > allKeys.begin );

-	// We must not merge the keyServers shard
-	if (keys.begin == keyServersPrefix) {
-		TraceEvent(SevError, "LastShardMerge", self->masterId)
-			.detail("ShardKeyBegin", printable(keys.begin))
-			.detail("ShardKeyEnd", printable(keys.end))
-			.detail("TrackerID", trackerId);
-		ASSERT(false);
-	}
-
 	// This will merge shards both before and after "this" shard in keyspace.
 	int shardsMerged = 1;
 	bool forwardComplete = false;
@ -394,7 +385,7 @@ Future<Void> shardMerger(
 	loop {
 		Optional<StorageMetrics> newMetrics;
 		if( !forwardComplete ) {
-			if( nextIter->range().end == keyServersPrefix ) {
+			if( nextIter->range().end == allKeys.end ) {
 				forwardComplete = true;
 				continue;
 			}
@ -610,14 +601,6 @@ ACTOR Future<Void> trackInitialShards(DataDistributionTracker *self,
 	state int lastBegin = -1;
 	state vector<UID> last;

-	//The ending shard does not have a shardTracker, so instead just track the size of the shard
-	Reference<AsyncVar<Optional<StorageMetrics>>> endShardSize( new AsyncVar<Optional<StorageMetrics>>() );
-	KeyRangeRef endShardRange( keyServersPrefix, allKeys.end );
-	ShardTrackedData endShardData;
-	endShardData.stats = endShardSize;
-	endShardData.trackBytes = trackShardBytes( self, endShardRange, endShardSize, g_random->randomUniqueID(), false );
-	self->shards.insert( endShardRange, endShardData );
-
 	state int s;
 	for(s=0; s<initData->shards.size(); s++) {
 		state InitialDataDistribution::Team src = initData->shards[s].value.first;
@ -637,8 +620,7 @@ ACTOR Future<Void> trackInitialShards(DataDistributionTracker *self,

 			if (lastBegin >= 0) {
 				state KeyRangeRef keys( initData->shards[lastBegin].begin, initData->shards[s].begin );
-				if (keys.begin < keyServersPrefix) // disallow spliting of keyServers shard
-					restartShardTrackers( self, keys );
+				restartShardTrackers( self, keys );
 				shardsAffectedByTeamFailure->defineShard( keys );
 				shardsAffectedByTeamFailure->moveShard( keys, last );
 			}
@ -648,7 +630,7 @@ ACTOR Future<Void> trackInitialShards(DataDistributionTracker *self,
 		Void _ = wait( yield( TaskDataDistribution ) );
 	}

-	Future<Void> initialSize = changeSizes( self, KeyRangeRef(allKeys.begin, keyServersPrefix), 0 );
+	Future<Void> initialSize = changeSizes( self, KeyRangeRef(allKeys.begin, allKeys.end), 0 );
 	self->readyToStart.send(Void());
 	Void _ = wait( initialSize );
 	self->maxShardSizeUpdater = updateMaxShardSize( self->cx->dbName, self->dbSizeEstimate, self->maxShardSize );
--- a/fdbserver/DatabaseConfiguration.cpp
+++ b/fdbserver/DatabaseConfiguration.cpp
@ -77,18 +77,22 @@ std::map<std::string, std::string> DatabaseConfiguration::toMap() const {
 	std::map<std::string, std::string> result;

 	if( initialized ) {
+		std::string tlogInfo = tLogPolicy->info();
+		std::string storageInfo = storagePolicy->info();
 		if( durableStorageQuorum == storageTeamSize &&
 			tLogWriteAntiQuorum == 0 ) {
 			if( tLogReplicationFactor == 1 && durableStorageQuorum == 1 )
 				result["redundancy_mode"] = "single";
 			else if( tLogReplicationFactor == 2 && durableStorageQuorum == 2 )
 				result["redundancy_mode"] = "double";
+			else if( tLogReplicationFactor == 3 && durableStorageQuorum == 3 && tlogInfo == "((dcid^3 x 1) & (zoneid^3 x 1))" && storageInfo == "((dcid^3 x 1) & (zoneid^3 x 1))" )
+				result["redundancy_mode"] = "three_datacenter";
 			else if( tLogReplicationFactor == 3 && durableStorageQuorum == 3 )
 				result["redundancy_mode"] = "triple";
-			else if( tLogReplicationFactor == 3 && durableStorageQuorum == 2 )
-				result["redundancy_mode"] = "fast_recovery_double";
-			else if( tLogReplicationFactor == 4 && durableStorageQuorum == 3 )
-				result["redundancy_mode"] = "fast_recovery_triple";
+			else if( tLogReplicationFactor == 4 && durableStorageQuorum == 3 && tlogInfo == "data_hall^2 x zoneid^2 x 1" && storageInfo == "data_hall^3 x 1" )
+				result["redundancy_mode"] = "three_data_hall";
+			else if( tLogReplicationFactor == 4 && durableStorageQuorum == 6 && tlogInfo == "dcid^2 x zoneid^2 x 1" && storageInfo == "dcid^3 x zoneid^2 x 1" )
+				result["redundancy_mode"] = "multi_dc";
 			else
 				result["redundancy_mode"] = "custom";
 		} else
--- a/fdbserver/DiskQueue.actor.cpp
+++ b/fdbserver/DiskQueue.actor.cpp
@ -298,11 +298,11 @@ public:
 				.detail("File0Size", self->files[0].size).detail("File1Size", self->files[1].size)
 				.detail("File0Name", self->files[0].dbgFilename).detail("SyncedFiles", syncFiles.size());*/

-			committed.send(Void());
 			if(g_random->random01() < 0.01) {
 				//occasionally delete all the ready future in the AndFuture
 				self->lastCommit.cleanup();
 			}
+			committed.send(Void());
 		} catch (Error& e) {
 			delete pageMem;
 			TEST(true);  // push error
@ -405,8 +405,8 @@ public:
 				TraceEvent("DiskQueueShutdownDeleting", self->dbgid)
 					.detail("File0", self->filename(0))
 					.detail("File1", self->filename(1));
-				Void _ = wait( IAsyncFileSystem::filesystem()->deleteFile( self->filename(0), false ) );
-				Void _ = wait( IAsyncFileSystem::filesystem()->deleteFile( self->filename(1), true ) );
+				Void _ = wait( IAsyncFile::incrementalDelete( self->filename(0), false ) );
+				Void _ = wait( IAsyncFile::incrementalDelete( self->filename(1), true ) );
 			}
 			TraceEvent("DiskQueueShutdownComplete", self->dbgid)
 				.detail("DeleteFiles", deleteFiles)
@ -419,8 +419,8 @@ public:
 		}

 		if( error.code() != error_code_actor_cancelled ) {
-			if (!self->stopped.isSet()) self->stopped.send(Void());
-			if (!self->error.isSet()) self->error.send(Never());
+			if (self->stopped.canBeSet()) self->stopped.send(Void());
+			if (self->error.canBeSet()) self->error.send(Never());
 			delete self;
 		}
 	}
--- a/fdbserver/KeyValueStoreSQLite.actor.cpp
+++ b/fdbserver/KeyValueStoreSQLite.actor.cpp
@ -1587,7 +1587,11 @@ private:
 			if (checkIntegrityOnOpen || EXPENSIVE_VALIDATION) {
 				if(conn.check(false) != 0) {
 					// A corrupt btree structure must not be used.
-					throw file_corrupt();
+					if (g_network->isSimulated() && (g_simulator.getCurrentProcess()->fault_injection_p1 || g_simulator.getCurrentProcess()->machine->machineProcess->fault_injection_p1 || g_simulator.getCurrentProcess()->rebooting)) {
+						throw file_corrupt().asInjectedFault();
+					} else {
+						throw file_corrupt();
+					}
 				}
 			}
 		}
@ -1823,8 +1827,8 @@ private:
 			self->logging.cancel();
 			Void _ = wait( self->readThreads->stop() && self->writeThread->stop() );
 			if (deleteOnClose) {
-				Void _ = wait( IAsyncFileSystem::filesystem()->deleteFile( self->filename, true ) );
-				Void _ = wait( IAsyncFileSystem::filesystem()->deleteFile( self->filename + "-wal", false ) );
+				Void _ = wait( IAsyncFile::incrementalDelete( self->filename, true ) );
+				Void _ = wait( IAsyncFile::incrementalDelete( self->filename + "-wal", false ) );
 			}
 		} catch (Error& e) {
 			TraceEvent(SevError, "KVDoCloseError", self->logID)
--- a/fdbserver/Knobs.cpp
+++ b/fdbserver/Knobs.cpp
@ -49,7 +49,7 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) {
 	init( MAX_MESSAGE_SIZE,            std::max<int>(LOG_SYSTEM_PUSHED_DATA_BLOCK_SIZE, 1e5 + 2e4 + 1) + 8 ); // VALUE_SIZE_LIMIT + SYSTEM_KEY_SIZE_LIMIT + 9 bytes (4 bytes for length, 4 bytes for sequence number, and 1 byte for mutation type)
 	init( TLOG_MESSAGE_BLOCK_BYTES,                             10e6 );
 	init( TLOG_MESSAGE_BLOCK_OVERHEAD_FACTOR,      double(TLOG_MESSAGE_BLOCK_BYTES) / (TLOG_MESSAGE_BLOCK_BYTES - MAX_MESSAGE_SIZE) ); //1.0121466709838096006362758832473
-	init( PEEK_TRACKER_EXPIRATION_TIME,                          600 ); if( randomize && BUGGIFY ) PEEK_TRACKER_EXPIRATION_TIME = 0.1;
+	init( PEEK_TRACKER_EXPIRATION_TIME,                          600 ); if( randomize && BUGGIFY ) PEEK_TRACKER_EXPIRATION_TIME = g_random->coinflip() ? 0.1 : 60;
 	init( PARALLEL_GET_MORE_REQUESTS,                             32 ); if( randomize && BUGGIFY ) PARALLEL_GET_MORE_REQUESTS = 2;
 	init( MAX_QUEUE_COMMIT_BYTES,                               15e6 ); if( randomize && BUGGIFY ) MAX_QUEUE_COMMIT_BYTES = 5000;

@ -124,7 +124,7 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) {
 		If this value is too small relative to SHARD_MIN_BYTES_PER_KSEC immediate merging work will be generated.
 		*/

-	init( STORAGE_METRIC_TIMEOUT,                              600.0 ); if( randomize && BUGGIFY ) STORAGE_METRIC_TIMEOUT = 10.0;
+	init( STORAGE_METRIC_TIMEOUT,                              600.0 ); if( randomize && BUGGIFY ) STORAGE_METRIC_TIMEOUT = g_random->coinflip() ? 10.0 : 60.0;
 	init( METRIC_DELAY,                                          0.1 ); if( randomize && BUGGIFY ) METRIC_DELAY = 1.0;
 	init( ALL_DATA_REMOVED_DELAY,                                1.0 );
 	init( INITIAL_FAILURE_REACTION_DELAY,                       30.0 ); if( randomize && BUGGIFY ) INITIAL_FAILURE_REACTION_DELAY = 0.0;
@ -255,18 +255,18 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) {
 	init( MASTER_FAILURE_SLOPE_DURING_RECOVERY,                  0.1 );
 	init( WORKER_COORDINATION_PING_DELAY,                         60 );
 	init( SIM_SHUTDOWN_TIMEOUT,                                   10 );
-	init( SHUTDOWN_TIMEOUT,                                      600 );
+	init( SHUTDOWN_TIMEOUT,                                      600 ); if( randomize && BUGGIFY ) SHUTDOWN_TIMEOUT = 60.0;
 	init( MASTER_SPIN_DELAY,                                     1.0 ); if( randomize && BUGGIFY ) MASTER_SPIN_DELAY = 10.0;
 	init( WAIT_FOR_GOOD_RECRUITMENT_DELAY,                       1.0 );
 	init( ATTEMPT_RECRUITMENT_DELAY,                            0.05 );
 	init( WORKER_FAILURE_TIME,                                   1.0 ); if( randomize && BUGGIFY ) WORKER_FAILURE_TIME = 10.0;
 	init( CHECK_BETTER_MASTER_INTERVAL,                          1.0 ); if( randomize && BUGGIFY ) CHECK_BETTER_MASTER_INTERVAL = 0.001;
-	init( INCOMPATIBLE_PEERS_LOGGING_INTERVAL,                   600 );
+	init( INCOMPATIBLE_PEERS_LOGGING_INTERVAL,                   600 ); if( randomize && BUGGIFY ) INCOMPATIBLE_PEERS_LOGGING_INTERVAL = 60.0;
 	init( EXPECTED_MASTER_FITNESS,             ProcessClass::GoodFit );
 	init( EXPECTED_TLOG_FITNESS,               ProcessClass::GoodFit );
 	init( EXPECTED_PROXY_FITNESS,              ProcessClass::GoodFit );
 	init( EXPECTED_RESOLVER_FITNESS,           ProcessClass::GoodFit );
-	init( RECRUITMENT_TIMEOUT,                                   600 ); if( randomize && BUGGIFY ) RECRUITMENT_TIMEOUT = 1.0;
+	init( RECRUITMENT_TIMEOUT,                                   600 ); if( randomize && BUGGIFY ) RECRUITMENT_TIMEOUT = g_random->coinflip() ? 60.0 : 1.0;

 	init( POLICY_RATING_TESTS,                                   200 ); if( randomize && BUGGIFY ) POLICY_RATING_TESTS = 20;
 	init( POLICY_GENERATIONS,                                    100 ); if( randomize && BUGGIFY ) POLICY_GENERATIONS = 10;
--- a/fdbserver/LeaderElection.actor.cpp
+++ b/fdbserver/LeaderElection.actor.cpp
@ -20,6 +20,7 @@

 #include "flow/actorcompiler.h"
 #include "fdbrpc/FailureMonitor.h"
+#include "fdbrpc/Locality.h"
 #include "ClusterRecruitmentInterface.h"
 #include "fdbserver/CoordinationInterface.h"
 #include "fdbclient/MonitorLeader.h"
@ -74,7 +75,7 @@ ACTOR Future<Void> changeLeaderCoordinators( ServerCoordinators coordinators, Va
 	return Void();
 }

-ACTOR Future<Void> tryBecomeLeaderInternal( ServerCoordinators coordinators, Value proposedSerializedInterface, Reference<AsyncVar<Value>> outSerializedLeader, bool hasConnected ) {
+ACTOR Future<Void> tryBecomeLeaderInternal( ServerCoordinators coordinators, Value proposedSerializedInterface, Reference<AsyncVar<Value>> outSerializedLeader, bool hasConnected, Reference<AsyncVar<ProcessClass>> asyncProcessClass ) {
 	state Reference<AsyncVar<vector<Optional<LeaderInfo>>>> nominees( new AsyncVar<vector<Optional<LeaderInfo>>>() );
 	state LeaderInfo myInfo;
 	state Future<Void> candidacies;
@ -90,7 +91,10 @@ ACTOR Future<Void> tryBecomeLeaderInternal( ServerCoordinators coordinators, Val
 	while (!iAmLeader) {
 		state Future<Void> badCandidateTimeout;

-		myInfo.changeID = g_random->randomUniqueID();
+		UID randomID = g_random->randomUniqueID();
+		uint64_t mask = 15ull << 60; // top four bits of the first half; unsigned so the shift cannot overflow
+		uint64_t modifiedFirstPart = (randomID.first() & ~mask) | ((uint64_t)asyncProcessClass->get().machineClassFitness(ProcessClass::ClusterController) << 60); // replace those bits with this process's ClusterController fitness
+		myInfo.changeID = UID(modifiedFirstPart, randomID.second());

 		vector<Future<Void>> cand;
 		for(int i=0; i<coordinators.leaderElectionServers.size(); i++)
--- a/fdbserver/LeaderElection.h
+++ b/fdbserver/LeaderElection.h
@ -23,6 +23,7 @@
 #pragma once

 #include "fdbrpc/fdbrpc.h"
+#include "fdbrpc/Locality.h"

 class ServerCoordinators;

@ -30,7 +31,8 @@ template <class LeaderInterface>
 Future<Void> tryBecomeLeader( ServerCoordinators const& coordinators,
 							  LeaderInterface const& proposedInterface,
 							  Reference<AsyncVar<Optional<LeaderInterface>>> const& outKnownLeader,
-							  bool hasConnected);
+							  bool hasConnected,
+							  Reference<AsyncVar<ProcessClass>> const& asyncProcessClass);

 // Participates in the given coordination group's leader election process, nominating the given
 // LeaderInterface (presumed to be a local interface) as leader.  The leader election process is
@ -46,16 +48,17 @@ Future<Void> changeLeaderCoordinators( ServerCoordinators const& coordinators, V

 #pragma region Implementation

-Future<Void> tryBecomeLeaderInternal( ServerCoordinators const& coordinators, Value const& proposedSerializedInterface, Reference<AsyncVar<Value>> const& outSerializedLeader, bool const& hasConnected );
+Future<Void> tryBecomeLeaderInternal( ServerCoordinators const& coordinators, Value const& proposedSerializedInterface, Reference<AsyncVar<Value>> const& outSerializedLeader, bool const& hasConnected, Reference<AsyncVar<ProcessClass>> const& asyncProcessClass );

 template <class LeaderInterface>
 Future<Void> tryBecomeLeader( ServerCoordinators const& coordinators,
 							  LeaderInterface const& proposedInterface,
 							  Reference<AsyncVar<Optional<LeaderInterface>>> const& outKnownLeader,
-							  bool hasConnected)
+							  bool hasConnected,
+							  Reference<AsyncVar<ProcessClass>> const& asyncProcessClass)
 {
 	Reference<AsyncVar<Value>> serializedInfo( new AsyncVar<Value> );
-	Future<Void> m = tryBecomeLeaderInternal( coordinators, BinaryWriter::toValue(proposedInterface, IncludeVersion()), serializedInfo, hasConnected );
+	Future<Void> m = tryBecomeLeaderInternal( coordinators, BinaryWriter::toValue(proposedInterface, IncludeVersion()), serializedInfo, hasConnected, asyncProcessClass );
 	return m || asyncDeserialize( serializedInfo, outKnownLeader );
 }

--- a/fdbserver/LogSystem.h
+++ b/fdbserver/LogSystem.h
@ -115,6 +115,7 @@ struct ILogSystem {
 		bool parallelGetMore;
 		int sequence;
 		Deque<Future<TLogPeekReply>> futureResults;
+		Future<Void> interfaceChanged;

 		ServerPeekCursor( Reference<AsyncVar<OptionalInterface<TLogInterface>>> const& interf, Tag tag, Version begin, Version end, bool returnIfBlocked, bool parallelGetMore );

--- a/fdbserver/LogSystemPeekCursor.actor.cpp
+++ b/fdbserver/LogSystemPeekCursor.actor.cpp
@ -119,6 +119,10 @@ ACTOR Future<Void> serverPeekParallelGetMore( ILogSystem::ServerPeekCursor* self
 		throw internal_error();
 	}

+	if(!self->interfaceChanged.isValid()) {
+		self->interfaceChanged = self->interf->onChange();
+	}
+
 	loop {
 		try {
 			while(self->futureResults.size() < SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS && self->interf->get().present()) {
@ -139,7 +143,9 @@ ACTOR Future<Void> serverPeekParallelGetMore( ILogSystem::ServerPeekCursor* self
 					//TraceEvent("SPC_getMoreB", self->randomID).detail("has", self->hasMessage()).detail("end", res.end).detail("popped", res.popped.present() ? res.popped.get() : 0);
 					return Void();
 				}
-				when( Void _ = wait( self->interf->onChange() ) ) {
+				when( Void _ = wait( self->interfaceChanged ) ) {
+					self->interfaceChanged = self->interf->onChange();
+					self->randomID = g_random->randomUniqueID();
 					self->sequence = 0;
 					self->futureResults.clear();
 				}
@ -150,6 +156,7 @@ ACTOR Future<Void> serverPeekParallelGetMore( ILogSystem::ServerPeekCursor* self
 				return Void();
 			} else if(e.code() == error_code_timed_out) {
 				TraceEvent("PeekCursorTimedOut", self->randomID);
+				self->interfaceChanged = self->interf->onChange();
 				self->randomID = g_random->randomUniqueID();
 				self->sequence = 0;
 				self->futureResults.clear();
--- a/fdbserver/MasterProxyServer.actor.cpp
+++ b/fdbserver/MasterProxyServer.actor.cpp
@ -1001,17 +1001,49 @@ ACTOR static Future<Void> readRequestServer(
 	TraceEvent("ProxyReadyForReads", proxy.id());

 	loop choose{
-		when(ReplyPromise<vector<StorageServerInterface>> req = waitNext(proxy.getKeyServersLocations.getFuture())) {
-			// SOMEDAY: keep ssis around?
-			vector<UID> src, dest;
-			decodeKeyServersValue(commitData->txnStateStore->readValue(keyServersKeyServersKey).get().get(), src, dest);
-			vector<StorageServerInterface> ssis;
-			ssis.reserve(src.size());
-			for (auto const& id : src) {
-				ssis.push_back(decodeServerListValue(commitData->txnStateStore->readValue(serverListKeyFor(id)).get().get()));
+		when(ReplyPromise<vector<pair<KeyRangeRef, vector<StorageServerInterface>>>> req = waitNext(proxy.getKeyServersLocations.getFuture())) {
+			Standalone<VectorRef<KeyValueRef>> keyServersBegin = commitData->txnStateStore->readRange(KeyRangeRef(allKeys.begin, keyServersKeyServersKeys.begin), -1).get();
+			Standalone<VectorRef<KeyValueRef>> keyServersEnd = commitData->txnStateStore->readRange(KeyRangeRef(keyServersKeyServersKeys.end, allKeys.end), 2).get();
+			Standalone<VectorRef<KeyValueRef>> keyServersShardBoundaries = commitData->txnStateStore->readRange(KeyRangeRef(keyServersBegin[0].key, keyServersEnd[1].key)).get();
+
+			Standalone<VectorRef<KeyValueRef>> serverListBegin = commitData->txnStateStore->readRange(KeyRangeRef(allKeys.begin, keyServersKey(serverListKeys.begin)), -1).get();
+			Standalone<VectorRef<KeyValueRef>> serverListEnd = commitData->txnStateStore->readRange(KeyRangeRef(keyServersKey(serverListKeys.end), allKeys.end), 2).get();
+			Standalone<VectorRef<KeyValueRef>> serverListShardBoundaries = commitData->txnStateStore->readRange(KeyRangeRef(serverListBegin[0].key, serverListEnd[1].key)).get();
+
+			bool ignoreFirstServerListShard = false;
+			if (keyServersShardBoundaries.back().key > serverListShardBoundaries.front().key)
+				ignoreFirstServerListShard = true;
+
+			// shards include all keyServers and serverLists information
+			vector<pair<KeyRangeRef, vector<StorageServerInterface>>> shards;
+			int reserveSize = keyServersShardBoundaries.size() + serverListShardBoundaries.size() - 2 - (ignoreFirstServerListShard ? 1 : 0);
+			shards.reserve(reserveSize);
+
+			for (int i = 0; i < keyServersShardBoundaries.size() - 1; i++) {
+				vector<UID> src, dest;
+				decodeKeyServersValue(keyServersShardBoundaries[i].value, src, dest);
+				vector<StorageServerInterface> ssis;
+				ssis.reserve(src.size());
+				for (auto const& id : src) {
+					ssis.push_back(decodeServerListValue(commitData->txnStateStore->readValue(serverListKeyFor(id)).get().get()));
+				}
+
+				shards.push_back(std::make_pair(KeyRangeRef(keyServersShardBoundaries[i].key.removePrefix(keyServersPrefix), keyServersShardBoundaries[i + 1].key.removePrefix(keyServersPrefix)), ssis));
 			}

-			req.send(ssis);
+			for (int i = ignoreFirstServerListShard ? 1 : 0 ; i < serverListShardBoundaries.size() - 1; i++) {
+				vector<UID> src, dest;
+				decodeKeyServersValue(serverListShardBoundaries[i].value, src, dest);
+				vector<StorageServerInterface> ssis;
+				ssis.reserve(src.size());
+				for (auto const& id : src) {
+					ssis.push_back(decodeServerListValue(commitData->txnStateStore->readValue(serverListKeyFor(id)).get().get()));
+				}
+
+				shards.push_back(std::make_pair(KeyRangeRef(serverListShardBoundaries[i].key.removePrefix(keyServersPrefix), serverListShardBoundaries[i + 1].key.removePrefix(keyServersPrefix)), ssis));
+			}
+
+			req.send(shards);
 		}
 		when(GetStorageServerRejoinInfoRequest req = waitNext(proxy.getStorageServerRejoinInfo.getFuture())) {
 			if (commitData->txnStateStore->readValue(serverListKeyFor(req.id)).get().present()) {
--- a/fdbserver/MoveKeys.actor.cpp
+++ b/fdbserver/MoveKeys.actor.cpp
@ -828,8 +828,7 @@ void seedShardServers(

 	// We have to set this range in two blocks, because the master tracking of "keyServersLocations" depends on a change to a specific
 	//   key (keyServersKeyServersKey)
-	krmSetPreviouslyEmptyRange( tr, arena, keyServersPrefix, KeyRangeRef(KeyRef(), keyServersPrefix), keyServersValue( serverIds ), Value() );
-	krmSetPreviouslyEmptyRange( tr, arena, keyServersPrefix, KeyRangeRef(keyServersPrefix, allKeys.end), keyServersValue( serverIds ), Value() );
+	krmSetPreviouslyEmptyRange( tr, arena, keyServersPrefix, KeyRangeRef(KeyRef(), allKeys.end), keyServersValue( serverIds ), Value() );

 	for(int s=0; s<servers.size(); s++)
 		krmSetPreviouslyEmptyRange( tr, arena, serverKeysPrefixFor( servers[s].id() ), allKeys, serverKeysTrue, serverKeysFalse );
--- a/fdbserver/QuietDatabase.actor.cpp
+++ b/fdbserver/QuietDatabase.actor.cpp
@ -139,6 +139,7 @@ ACTOR Future<vector<StorageServerInterface>> getStorageServers( Database cx, boo
 	state Transaction tr( cx );
 	if (use_system_priority)
 		tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
+	tr.setOption(FDBTransactionOptions::LOCK_AWARE); // always executed: the brace-less if above guards only the priority option
 	loop {
 		try {
 			Standalone<RangeResultRef> serverList = wait( tr.getRange( serverListKeys, CLIENT_KNOBS->TOO_MANY ) );
@ -332,7 +333,7 @@ ACTOR Future<Void> disableConnectionFailuresAfter( Future<Void> f, double disabl
 		}
 		when(Void _ = wait(delay(disableTime))) {
 			g_simulator.speedUpSimulation = true;
-			g_simulator.enableConnectionFailures = false;
+			g_simulator.connectionFailuresDisableDuration = 1e6;
 			TraceEvent(SevWarnAlways, ("DisableConnectionFailures_" + context).c_str());
 		}
 	}
--- a/fdbserver/SimulatedCluster.actor.cpp
+++ b/fdbserver/SimulatedCluster.actor.cpp
@ -204,8 +204,8 @@ ACTOR Future<ISimulator::KillType> simulatedFDBDRebooter(
 	loop {
 		auto waitTime = SERVER_KNOBS->MIN_REBOOT_TIME + (SERVER_KNOBS->MAX_REBOOT_TIME - SERVER_KNOBS->MIN_REBOOT_TIME) * g_random->random01();
 		cycles ++;
-		TraceEvent("SimulatedFDBDWait").detail("Cycles", cycles).detail("RandomId", randomId)
-			.detail("ProcessAddress", NetworkAddress(ip, port, true, false))
+		TraceEvent("SimulatedFDBDPreWait").detail("Cycles", cycles).detail("RandomId", randomId)
+			.detail("Address", NetworkAddress(ip, port, true, false))
 			.detailext("ZoneId", localities.zoneId())
 			.detail("waitTime", waitTime).detail("Port", port);

@ -219,10 +219,10 @@ ACTOR Future<ISimulator::KillType> simulatedFDBDRebooter(
 			TraceEvent("SimulatedRebooterStarting", localities.zoneId()).detail("Cycles", cycles).detail("RandomId", randomId)
 				.detailext("ZoneId", localities.zoneId())
 				.detailext("DataHall", localities.dataHallId())
-				.detail("ProcessAddress", process->address.toString())
-				.detail("ProcessExcluded", process->excluded)
+				.detail("Address", process->address.toString())
+				.detail("Excluded", process->excluded)
 				.detail("UsingSSL", useSSL);
-			TraceEvent("ProgramStart").detail("Cycles", cycles)
+			TraceEvent("ProgramStart").detail("Cycles", cycles).detail("RandomId", randomId)
 				.detail("SourceVersion", getHGVersion())
 				.detail("Version", FDB_VT_VERSION)
 				.detail("PackageName", FDB_VT_PACKAGE_NAME)
@ -248,7 +248,7 @@ ACTOR Future<ISimulator::KillType> simulatedFDBDRebooter(
 			} catch (Error& e) {
 				// If in simulation, if we make it here with an error other than io_timeout but enASIOTimedOut is set then somewhere an io_timeout was converted to a different error.
 				if(g_network->isSimulated() && e.code() != error_code_io_timeout && (bool)g_network->global(INetwork::enASIOTimedOut))
-					TraceEvent(SevError, "IOTimeoutErrorSuppressed").detail("ErrorCode", e.code()).backtrace();
+					TraceEvent(SevError, "IOTimeoutErrorSuppressed").detail("ErrorCode", e.code()).detail("RandomId", randomId).backtrace();

 				if (onShutdown.isReady() && onShutdown.isError()) throw onShutdown.getError();
 				if(e.code() != error_code_actor_cancelled)
@ -258,15 +258,15 @@ ACTOR Future<ISimulator::KillType> simulatedFDBDRebooter(
 			}

 			TraceEvent("SimulatedFDBDDone", localities.zoneId()).detail("Cycles", cycles).detail("RandomId", randomId)
-				.detail("ProcessAddress", process->address)
-				.detail("ProcessExcluded", process->excluded)
+				.detail("Address", process->address)
+				.detail("Excluded", process->excluded)
 				.detailext("ZoneId", localities.zoneId())
 				.detail("KillType", onShutdown.isReady() ? onShutdown.get() : ISimulator::None);

 			if (!onShutdown.isReady())
 				onShutdown = ISimulator::InjectFaults;
 		} catch (Error& e) {
-			TraceEvent(destructed ? SevInfo : SevError, "SimulatedFDBDRebooterError", localities.zoneId()).error(e, true);
+			TraceEvent(destructed ? SevInfo : SevError, "SimulatedFDBDRebooterError", localities.zoneId()).detail("RandomId", randomId).error(e, true);
 			onShutdown = e;
 		}

@ -276,6 +276,11 @@ ACTOR Future<ISimulator::KillType> simulatedFDBDRebooter(
 			process->rebooting = true;
 			process->shutdownSignal.send(ISimulator::None);
 		}
+		TraceEvent("SimulatedFDBDWait", localities.zoneId()).detail("Cycles", cycles).detail("RandomId", randomId)
+			.detail("Address", process->address)
+			.detail("Excluded", process->excluded)
+			.detail("Rebooting", process->rebooting)
+			.detailext("ZoneId", localities.zoneId());
 		Void _ = wait( g_simulator.onProcess( simProcess ) );

 		Void _ = wait(delay(0.00001 + FLOW_KNOBS->MAX_BUGGIFIED_DELAY));  // One last chance for the process to clean up?
@ -284,15 +289,15 @@ ACTOR Future<ISimulator::KillType> simulatedFDBDRebooter(

 		auto shutdownResult = onShutdown.get();
 		TraceEvent("SimulatedFDBDShutdown", localities.zoneId()).detail("Cycles", cycles).detail("RandomId", randomId)
-			.detail("ProcessAddress", process->address)
-			.detail("ProcessExcluded", process->excluded)
+			.detail("Address", process->address)
+			.detail("Excluded", process->excluded)
 			.detailext("ZoneId", localities.zoneId())
 			.detail("KillType", shutdownResult);

 		if( shutdownResult < ISimulator::RebootProcessAndDelete ) {
 			TraceEvent("SimulatedFDBDLowerReboot", localities.zoneId()).detail("Cycles", cycles).detail("RandomId", randomId)
-				.detail("ProcessAddress", process->address)
-				.detail("ProcessExcluded", process->excluded)
+				.detail("Address", process->address)
+				.detail("Excluded", process->excluded)
 				.detailext("ZoneId", localities.zoneId())
 				.detail("KillType", shutdownResult);
 			return onShutdown.get();
@ -300,7 +305,7 @@ ACTOR Future<ISimulator::KillType> simulatedFDBDRebooter(

 		if( onShutdown.get() == ISimulator::RebootProcessAndDelete ) {
 			TraceEvent("SimulatedFDBDRebootAndDelete", localities.zoneId()).detail("Cycles", cycles).detail("RandomId", randomId)
-				.detail("ProcessAddress", process->address)
+				.detail("Address", process->address)
 				.detailext("ZoneId", localities.zoneId())
 				.detail("KillType", shutdownResult);
 			*coordFolder = joinPath(baseFolder, g_random->randomUniqueID().toString());
@ -317,7 +322,7 @@ ACTOR Future<ISimulator::KillType> simulatedFDBDRebooter(
 		}
 		else {
 			TraceEvent("SimulatedFDBDJustRepeat", localities.zoneId()).detail("Cycles", cycles).detail("RandomId", randomId)
-				.detail("ProcessAddress", process->address)
+				.detail("Address", process->address)
 				.detailext("ZoneId", localities.zoneId())
 				.detail("KillType", shutdownResult);
 		}
@ -351,6 +356,7 @@ ACTOR Future<Void> simulatedMachine(
 	state int bootCount = 0;
 	state std::vector<std::string> myFolders;
 	state std::vector<std::string> coordFolders;
+	state UID randomId = g_nondeterministic_random->randomUniqueID();

 	try {
 		CSimpleIni ini;
@ -387,6 +393,7 @@ ACTOR Future<Void> simulatedMachine(
 				std::string path = joinPath(myFolders[i], "fdb.cluster");
 				Reference<ClusterConnectionFile> clusterFile(useSeedFile ? new ClusterConnectionFile(path, connStr.toString()) : new ClusterConnectionFile(path));
 				processes.push_back(simulatedFDBDRebooter(clusterFile, ips[i], sslEnabled, i + 1, localities, processClass, &myFolders[i], &coordFolders[i], baseFolder, connStr, useSeedFile, runBackupAgents));
+				TraceEvent("SimulatedMachineProcess", randomId).detail("Address", NetworkAddress(ips[i], i+1, true, false)).detailext("ZoneId", localities.zoneId()).detailext("DataHall", localities.dataHallId()).detail("Folder", myFolders[i]);
 			}

 			TEST( bootCount >= 1 ); // Simulated machine rebooted
@ -394,7 +401,7 @@ ACTOR Future<Void> simulatedMachine(
 			TEST( bootCount >= 3 ); // Simulated machine rebooted three times
 			++bootCount;

-			TraceEvent("SimulatedMachineStart")
+			TraceEvent("SimulatedMachineStart", randomId)
 				.detail("Folder0", myFolders[0])
 				.detail("CFolder0", coordFolders[0])
 				.detail("MachineIPs", toIPVectorString(ips))
@ -410,7 +417,7 @@ ACTOR Future<Void> simulatedMachine(

 			Void _ = wait( waitForAll( processes ) );

-			TraceEvent("SimulatedMachineRebootStart")
+			TraceEvent("SimulatedMachineRebootStart", randomId)
 				.detail("Folder0", myFolders[0])
 				.detail("CFolder0", coordFolders[0])
 				.detail("MachineIPs", toIPVectorString(ips))
@ -442,7 +449,12 @@ ACTOR Future<Void> simulatedMachine(
 				ASSERT( it.second.isReady() && !it.second.isError() );
 			}

-			TraceEvent("SimulatedMachineRebootAfterKills")
+			for( auto it : g_simulator.getMachineById(localities.zoneId())->deletingFiles ) {
+				filenames.insert( it );
+				closingStr += it + ", ";
+			}
+
+			TraceEvent("SimulatedMachineRebootAfterKills", randomId)
 				.detail("Folder0", myFolders[0])
 				.detail("CFolder0", coordFolders[0])
 				.detail("MachineIPs", toIPVectorString(ips))
@ -471,12 +483,12 @@ ACTOR Future<Void> simulatedMachine(
 						openFiles += *it + ", ";
 						i++;
 					}
-					TraceEvent("MachineFilesOpen").detail("PAddr", toIPVectorString(ips)).detail("OpenFiles", openFiles);
+					TraceEvent("MachineFilesOpen", randomId).detail("PAddr", toIPVectorString(ips)).detail("OpenFiles", openFiles);
 				} else
 					break;

 				if( shutdownDelayCount++ >= 50 ) {  // Worker doesn't shut down instantly on reboot
-					TraceEvent(SevError, "SimulatedFDBDFilesCheck")
+					TraceEvent(SevError, "SimulatedFDBDFilesCheck", randomId)
 						.detail("PAddrs", toIPVectorString(ips))
 						.detailext("ZoneId", localities.zoneId())
 						.detailext("DataHall", localities.dataHallId());
@ -487,8 +499,8 @@ ACTOR Future<Void> simulatedMachine(
 				backoff = std::min( backoff + 1.0, 6.0 );
 			}

-			TraceEvent("SimulatedFDBDFilesClosed")
-				.detail("ProcessAddress", toIPVectorString(ips))
+			TraceEvent("SimulatedFDBDFilesClosed", randomId)
+				.detail("Address", toIPVectorString(ips))
 				.detailext("ZoneId", localities.zoneId())
 				.detailext("DataHall", localities.dataHallId());

@ -510,7 +522,7 @@ ACTOR Future<Void> simulatedMachine(

 			auto rebootTime = g_random->random01() * MACHINE_REBOOT_TIME;

-			TraceEvent("SimulatedMachineShutdown")
+			TraceEvent("SimulatedMachineShutdown", randomId)
 				.detail("Swap", swap)
 				.detail("KillType", killType)
 				.detail("RebootTime", rebootTime)
@ -530,7 +542,7 @@ ACTOR Future<Void> simulatedMachine(

 				if( myFolders != toRebootFrom ) {
 					TEST( true ); // Simulated machine swapped data folders
-					TraceEvent("SimulatedMachineFolderSwap")
+					TraceEvent("SimulatedMachineFolderSwap", randomId)
 						.detail("OldFolder0", myFolders[0]).detail("NewFolder0", toRebootFrom[0])
 						.detail("MachineIPs", toIPVectorString(ips));
 				}
@ -648,93 +660,157 @@ ACTOR Future<Void> restartSimulatedSystem(vector<Future<Void>> *systemActors, st
 	return Void();
 }

-std::string randomConfiguration( int physicalDatacenters ) {
-	int r = std::min(g_random->randomInt(0, 6), 3);
-//	r = 1; //ahm
+struct SimulationConfig {  // Database configuration plus cluster layout chosen for one simulated run.
+	explicit SimulationConfig(int extraDB, int minimumReplication);  // Stores extraDB, then calls generateNormalConfig(minimumReplication).
+	int extraDB;  // Constructor argument; when non-zero, generateNormalConfig() uses smaller machine/process limits.

-	// See also random configuration choices in ConfigureDatabase workload
+	DatabaseConfiguration db;  // Filled in by set_config() and by direct field writes in generateNormalConfig().

-	std::string startingConfig = "new";
-	if (r == 0) {
+	void set_config(std::string config);  // Expands the string with buildConfiguration() and applies each key/value to db.
+
+	// Simulation layout
+	int datacenters;  // Drawn with g_random->randomInt( 1, 4 ) in generateNormalConfig().
+	int machine_count;  // Total, not per DC.
+	int processes_per_machine;  // Drawn from 1 up to a bound derived from extraDB and machine_count.
+	int coordinators;  // Random under BUGGIFY, otherwise min(machine_count, 2*maxMachineFailuresTolerated()+1).
+
+	std::string toString();  // Renders db as a configuration string that starts with "new".
+
+private:
+	void generateNormalConfig(int minimumReplication);  // Populates db and every layout field; called from the constructor.
+};
+
+// Records extraDB first (generateNormalConfig() reads it), then randomizes
+// the database configuration and the cluster layout.
+SimulationConfig::SimulationConfig(int extraDB, int minimumReplication)
+	: extraDB(extraDB)
+{
+	generateNormalConfig(minimumReplication);
+}
+
+// Applies a configuration string (for example "single" or "ssd") to `db`.
+// buildConfiguration() is the only mechanism available for expanding such a
+// token into the individual settings it stands for, so the string is expanded
+// into a scratch map and each resulting entry is then passed to db.set().
+// NOTE(review): the assertion only checks that buildConfiguration()'s return
+// value is truthy -- confirm that this matches how it reports success.
+void SimulationConfig::set_config(std::string config) {
+	std::map<std::string, std::string> hack_map;
+	ASSERT( buildConfiguration(config, hack_map) );
+	for (auto entry = hack_map.begin(); entry != hack_map.end(); ++entry) {
+		db.set( entry->first, entry->second );
+	}
+}
+
+// Builds a StringRef over the bytes of a NUL-terminated C string; the
+// terminator is not counted in the length.
+// NOTE(review): StringRef is presumably a non-owning view, in which case `s`
+// must outlive the returned value -- confirm.
+StringRef StringRefOf(const char* s) {
+	// Keep the const qualifier of the input instead of casting it away.
+	return StringRef((const uint8_t*)s, strlen(s));
+}
+
+// Randomly chooses a complete simulation setup: the database configuration
+// held in `db` and the cluster layout (datacenters, machine_count,
+// processes_per_machine, coordinators).
+//
+// minimumReplication is a lower bound on the replication type selected below:
+// 0 = custom replica counts, 1 = single, 2 = double, 3 = triple,
+// two_datacenter or three_data_hall depending on `datacenters`. A value above
+// 3 reaches the default case and fails its assertion.
+//
+// NOTE(review): g_random is presumably a seeded generator, so the sequence of
+// draws below should not be reordered -- confirm before restructuring.
+void SimulationConfig::generateNormalConfig(int minimumReplication) {
+	set_config("new");
+	datacenters = g_random->randomInt( 1, 4 );
+	if (g_random->random01() < 0.25) db.desiredTLogCount = g_random->randomInt(1,7);
+	if (g_random->random01() < 0.25) db.masterProxyCount = g_random->randomInt(1,7);
+	if (g_random->random01() < 0.25) db.resolverCount = g_random->randomInt(1,7);
+	if (g_random->random01() < 0.5) {
+		set_config("ssd");
+	} else {
+		set_config("memory");
+	}
+
+	int replication_type = std::max(minimumReplication, std::min(g_random->randomInt( 0, 6 ), 3));
+	//replication_type = 1;  //ahm
+	switch (replication_type) {
+	case 0: {
 		TEST( true );  // Simulated cluster using custom redundancy mode
-		int storage_replicas = g_random->randomInt(1,5);
-		startingConfig += " storage_replicas:=" + format("%d", storage_replicas);
-		startingConfig += " storage_quorum:=" + format("%d", storage_replicas);
-		int log_replicas = g_random->randomInt(1,5);
-		startingConfig += " log_replicas:=" + format("%d", log_replicas);
-		int log_anti_quorum = g_random->randomInt(0, log_replicas);
-		startingConfig += " log_anti_quorum:=" + format("%d", log_anti_quorum);
-		startingConfig += " replica_datacenters:=1";
-		startingConfig += " min_replica_datacenters:=1";
+		int storage_servers = g_random->randomInt(1,5);
+		int replication_factor = g_random->randomInt(1,5);
+		int anti_quorum = g_random->randomInt(0, replication_factor);
+		// Go through buildConfiguration, as it sets tLogPolicy/storagePolicy.
+		// Four %d conversions, one per argument below.
+		set_config(format("storage_replicas:=%d storage_quorum:=%d "
+		                  "log_replicas:=%d log_anti_quorum:=%d "
+		                  "replica_datacenters:=1 min_replica_datacenters:=1",
+		                  storage_servers, storage_servers,
+		                  replication_factor, anti_quorum));
+		break;
 	}
-	else if (r == 1) {
+	case 1: {
 		TEST( true );  // Simulated cluster running in single redundancy mode
-		startingConfig += " single";
+		set_config("single");
+		break;
 	}
-	else if( r == 2 ) {
+	case 2: {
 		TEST( true );  // Simulated cluster running in double redundancy mode
-		startingConfig += " double";
+		set_config("double");
+		break;
 	}
-	else if( r == 3 ) {
-		if( physicalDatacenters == 1 ) {
+	case 3: {
+		if( datacenters == 1 ) {
 			TEST( true );  // Simulated cluster running in triple redundancy mode
-			startingConfig += " triple";
+			set_config("triple");
 		}
-		else if( physicalDatacenters == 2 ) {
+		else if( datacenters == 2 ) {
 			TEST( true );  // Simulated cluster running in 2 datacenter mode
-			startingConfig += " two_datacenter";
+			set_config("two_datacenter");
 		}
-		else if( physicalDatacenters == 3 ) {
+		else if( datacenters == 3 ) {
 			TEST( true );  // Simulated cluster running in 3 data-hall mode
-			startingConfig += " three_data_hall";
+			set_config("three_data_hall");
 		}
 		else {
 			ASSERT( false );
 		}
+		break;
+	}
+	default:
+		ASSERT(false);  // Programmer forgot to adjust cases.
 	}

-	if (g_random->random01() < 0.25) startingConfig += " logs=" + format("%d", g_random->randomInt(1,7));
-	if (g_random->random01() < 0.25) startingConfig += " proxies=" + format("%d", g_random->randomInt(1,7));
-	if (g_random->random01() < 0.25) startingConfig += " resolvers=" + format("%d", g_random->randomInt(1,7));
+	machine_count = g_random->randomInt( std::max( 2+datacenters, db.minMachinesRequired() ), extraDB ? 6 : 10 );
+	processes_per_machine = g_random->randomInt(1, (extraDB ? 14 : 28)/machine_count + 2 );
+	coordinators = BUGGIFY ? g_random->randomInt(1, machine_count+1) : std::min( machine_count, db.maxMachineFailuresTolerated()*2 + 1 );
+}

-	startingConfig += g_random->random01() < 0.5 ? " ssd" : " memory";
-	return startingConfig;
+// Renders `db` as a space-separated configuration string. The string starts
+// with "new", followed by either the named redundancy mode or, when that mode
+// is "custom", the four explicit replica/quorum settings; then the desired
+// log, proxy and resolver counts; and finally the storage engine name.
+std::string SimulationConfig::toString() {
+	std::map<std::string, std::string> settings = db.toMap();
+	const std::string& redundancy = settings["redundancy_mode"];
+
+	std::stringstream out;
+	out << "new";
+
+	if (redundancy == "custom") {
+		out << " log_replicas:=" << db.tLogReplicationFactor
+		    << " log_anti_quorum:=" << db.tLogWriteAntiQuorum
+		    << " storage_replicas:=" << db.storageTeamSize
+		    << " storage_quorum:=" << db.durableStorageQuorum;
+	} else {
+		out << " " << redundancy;
+	}
+
+	out << " logs=" << db.getDesiredLogs();
+	out << " proxies=" << db.getDesiredProxies();
+	out << " resolvers=" << db.getDesiredResolvers();
+
+	out << " " << settings["storage_engine"];
+	return out.str();
 }

 void setupSimulatedSystem( vector<Future<Void>> *systemActors, std::string baseFolder,
 							int* pTesterCount, Optional<ClusterConnectionString> *pConnString,
-							Standalone<StringRef> *pStartingConfiguration, int extraDB)
+							Standalone<StringRef> *pStartingConfiguration, int extraDB, int minimumReplication)
 {
-	int dataCenters = g_random->randomInt( 1, 4 );
-
 	// SOMEDAY: this does not test multi-interface configurations
-	std::string startingConfigString = randomConfiguration(dataCenters);
-	std::map<std::string,std::string> startingConfigMap;
-	ASSERT( buildConfiguration( startingConfigString, startingConfigMap ) == ConfigurationResult::SUCCESS );
+	SimulationConfig simconfig(extraDB, minimumReplication);
+	std::string startingConfigString = simconfig.toString();

-	DatabaseConfiguration startingConfig;
-	for(auto kv : startingConfigMap) startingConfig.set( kv.first, kv.second );
-	g_simulator.storagePolicy = startingConfig.storagePolicy;
-	g_simulator.tLogPolicy = startingConfig.tLogPolicy;
-	g_simulator.tLogWriteAntiQuorum = startingConfig.tLogWriteAntiQuorum;
+	g_simulator.storagePolicy = simconfig.db.storagePolicy;
+	g_simulator.tLogPolicy = simconfig.db.tLogPolicy;
+	g_simulator.tLogWriteAntiQuorum = simconfig.db.tLogWriteAntiQuorum;
 	ASSERT(g_simulator.storagePolicy);
 	ASSERT(g_simulator.tLogPolicy);
 	TraceEvent("simulatorConfig").detail("tLogPolicy", g_simulator.tLogPolicy->info()).detail("storagePolicy", g_simulator.storagePolicy->info()).detail("tLogWriteAntiQuorum", g_simulator.tLogWriteAntiQuorum).detail("ConfigString", startingConfigString);

-	int machineCount = g_random->randomInt( std::max( 2+dataCenters, startingConfig.minMachinesRequired() ), extraDB ? 6 : 10 );
+	const int dataCenters = simconfig.datacenters;
+	const int machineCount = simconfig.machine_count;
+	const int coordinatorCount = simconfig.coordinators;
+	const int processesPerMachine = simconfig.processes_per_machine;

 	// half the time, when we have more than 4 machines that are not the first in their dataCenter, assign classes
 	bool assignClasses = machineCount - dataCenters > 4 && g_random->random01() < 0.5;
-	int processesPerMachine = g_random->randomInt(1, (extraDB ? 14 : 28)/machineCount + 2 );

 	// Use SSL half the time
 	bool sslEnabled = g_random->random01() < 0.05;
 	TEST( sslEnabled ); // SSL enabled
 	TEST( !sslEnabled ); // SSL disabled

-	// Pick coordination processes.
-	int coordinatorCount = BUGGIFY ? g_random->randomInt(1, machineCount+1) : std::min( machineCount, startingConfig.maxMachineFailuresTolerated()*2 + 1 );
-
 	vector<NetworkAddress> coordinatorAddresses;
 	for( int dc = 0; dc < dataCenters; dc++ ) {
 		int machines = machineCount / dataCenters + (dc < machineCount % dataCenters); // add remainder of machines to first datacenter
@ -817,12 +893,12 @@ void setupSimulatedSystem( vector<Future<Void>> *systemActors, std::string baseF
 	}

 	g_simulator.desiredCoordinators = coordinatorCount;
-	g_simulator.killableMachines = startingConfig.maxMachineFailuresTolerated();
+	g_simulator.killableMachines = simconfig.db.maxMachineFailuresTolerated();
 	g_simulator.neededDatacenters = 1;
 	g_simulator.killableDatacenters = 0;
 	g_simulator.physicalDatacenters = dataCenters;
 	g_simulator.maxCoordinatorsInDatacenter = ((coordinatorCount-1)/dataCenters) + 1;
-	g_simulator.machinesNeededForProgress = startingConfig.minMachinesRequired() + nonVersatileMachines;
+	g_simulator.machinesNeededForProgress = simconfig.db.minMachinesRequired() + nonVersatileMachines;
 	g_simulator.processesPerMachine = processesPerMachine;

 	TraceEvent("SetupSimulatorSettings")
@ -889,11 +965,11 @@ void setupSimulatedSystem( vector<Future<Void>> *systemActors, std::string baseF
 	Void _ = wait( DatabaseContext::configureDatabase( *pZookeeper, ClusterInterface::ALL, mode ) );*/
 }

-int checkExtraDB(const char *testFile) {
+void checkExtraDB(const char *testFile, int &extraDB, int &minimumReplication) {
 	std::ifstream ifs;
 	ifs.open(testFile, std::ifstream::in);
 	if (!ifs.good())
-		return 0;
+		return;

 	std::string cline;

@ -911,15 +987,15 @@ int checkExtraDB(const char *testFile) {
 		std::string value = removeWhitespace(line.substr(found + 1));

 		if (attrib == "extraDB") {
-			int v = 0;
-			sscanf( value.c_str(), "%d", &v );
-			ifs.close();
-			return v;
+			sscanf( value.c_str(), "%d", &extraDB );
+		}
+
+		if (attrib == "minimumReplication") {
+			sscanf( value.c_str(), "%d", &minimumReplication );
 		}
 	}

 	ifs.close();
-	return 0;
 }

 ACTOR void setupAndRun(std::string dataFolder, const char *testFile, bool rebooting ) {
@ -927,7 +1003,9 @@ ACTOR void setupAndRun(std::string dataFolder, const char *testFile, bool reboot
 	state Optional<ClusterConnectionString> connFile;
 	state Standalone<StringRef> startingConfiguration;
 	state int testerCount = 1;
-	state int extraDB = checkExtraDB(testFile);
+	state int extraDB = 0;
+	state int minimumReplication = 0;
+	checkExtraDB(testFile, extraDB, minimumReplication);

 	Void _ = wait( g_simulator.onProcess( g_simulator.newProcess(
 			"TestSystem", 0x01010101, 1, LocalityData(Optional<Standalone<StringRef>>(), Standalone<StringRef>(g_random->randomUniqueID().toString()), Optional<Standalone<StringRef>>(), Optional<Standalone<StringRef>>()), ProcessClass(ProcessClass::TesterClass, ProcessClass::CommandLineSource), "", "" ), TaskDefaultYield ) );
@ -944,7 +1022,7 @@ ACTOR void setupAndRun(std::string dataFolder, const char *testFile, bool reboot
 		}
 		else {
 			g_expect_full_pointermap = 1;
-			setupSimulatedSystem( &systemActors, dataFolder, &testerCount, &connFile, &startingConfiguration, extraDB );
+			setupSimulatedSystem( &systemActors, dataFolder, &testerCount, &connFile, &startingConfiguration, extraDB, minimumReplication );
 			Void _ = wait( delay(1.0) ); // FIXME: WHY!!!  //wait for machines to boot
 		}
 		std::string clusterFileDir = joinPath( dataFolder, g_random->randomUniqueID().toString() );
--- a/fdbserver/Status.actor.cpp
+++ b/fdbserver/Status.actor.cpp
@ -947,6 +947,7 @@ ACTOR static Future<double> doGrvProbe(Transaction *tr, Optional<FDBTransactionO

 	loop {
 		try {
+			tr->setOption(FDBTransactionOptions::LOCK_AWARE);
 			if(priority.present()) {
 				tr->setOption(priority.get());
 			}
@ -969,6 +970,7 @@ ACTOR static Future<double> doReadProbe(Future<double> grvProbe, Transaction *tr
 	state double start = timer_monotonic();

 	loop {
+		tr->setOption(FDBTransactionOptions::LOCK_AWARE);
 		try {
 			Optional<Standalone<StringRef> > _ = wait(tr->get(LiteralStringRef("\xff/StatusJsonTestKey62793")));
 			return timer_monotonic() - start;
@ -993,6 +995,7 @@ ACTOR static Future<double> doCommitProbe(Future<double> grvProbe, Transaction *

 	loop {
 		try {
+			tr->setOption(FDBTransactionOptions::LOCK_AWARE);
 			tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
 			tr->makeSelfConflicting();
 			Void _ = wait(tr->commit());
@ -1022,9 +1025,7 @@ ACTOR static Future<Void> doProbe(Future<double> probe, int timeoutSeconds, cons
 	return Void();
 }

-ACTOR static Future<StatusObject> latencyProbeFetcher(Reference<AsyncVar<struct ServerDBInfo>> db, StatusArray *messages, std::set<std::string> *incomplete_reasons) {
-	Database cx = openDBOnServer(db, TaskDefaultEndpoint, true, true); // Open a new database connection that is lock-aware
-
+ACTOR static Future<StatusObject> latencyProbeFetcher(Database cx, StatusArray *messages, std::set<std::string> *incomplete_reasons) {
 	state Transaction trImmediate(cx);
 	state Transaction trDefault(cx);
 	state Transaction trBatch(cx);
@ -1777,9 +1778,7 @@ ACTOR Future<StatusReply> clusterGetStatus(

 		if (configuration.present()){
 			// Do the latency probe by itself to avoid interference from other status activities
-			state Future<StatusObject> latencyProbe = latencyProbeFetcher(db, &messages, &status_incomplete_reasons);
-
-			StatusObject latencyProbeResults = wait(latencyProbe);
+			StatusObject latencyProbeResults = wait(latencyProbeFetcher(cx, &messages, &status_incomplete_reasons));

 			statusObj["database_available"] = latencyProbeResults.count("immediate_priority_transaction_start_seconds") && latencyProbeResults.count("read_seconds") && latencyProbeResults.count("commit_seconds");
 			if (!latencyProbeResults.empty()) {
--- a/fdbserver/StorageMetrics.actor.h
+++ b/fdbserver/StorageMetrics.actor.h
@ -54,7 +54,7 @@ struct StorageMetricSample {
 				bck_split.decrementNonEnd();

 				KeyRef split = keyBetween(KeyRangeRef(bck_split != sample.begin() ? std::max<KeyRef>(*bck_split,range.begin) : range.begin, *it));
-				if( split.size() <= CLIENT_KNOBS->SPLIT_KEY_SIZE_LIMIT )
+				if(!front || (getEstimate(KeyRangeRef(range.begin, split)) > 0 && split.size() <= CLIENT_KNOBS->SPLIT_KEY_SIZE_LIMIT))
 					return split;
 			}

@ -63,7 +63,7 @@ struct StorageMetricSample {
 				++it;

 				KeyRef split = keyBetween(KeyRangeRef(*fwd_split, it != sample.end() ? std::min<KeyRef>(*it, range.end) : range.end));
-				if( split.size() <= CLIENT_KNOBS->SPLIT_KEY_SIZE_LIMIT )
+				if(front || (getEstimate(KeyRangeRef(split, range.end)) > 0 && split.size() <= CLIENT_KNOBS->SPLIT_KEY_SIZE_LIMIT))
 					return split;

 				fwd_split = it;
--- a/fdbserver/TLogServer.actor.cpp
+++ b/fdbserver/TLogServer.actor.cpp
@ -200,6 +200,63 @@ struct CompareFirst {
 	}
 };

+// Builds the key range [prefix, strinc(prefix)).
+KeyRange prefixRange( KeyRef prefix ) {
+	Key upperBound = strinc(prefix);
+	return KeyRangeRef( prefix, upperBound );
+}
+
+////// Persistence format (for self->persistentData)
+
+// Immutable keys
+static const KeyValueRef persistFormat( LiteralStringRef( "Format" ), LiteralStringRef("FoundationDB/LogServer/2/3") );  // Key "Format" paired with the format string of this version.
+static const KeyRangeRef persistFormatReadableRange( LiteralStringRef("FoundationDB/LogServer/2/2"), LiteralStringRef("FoundationDB/LogServer/2/4") );  // Bounds ".../2/2" and ".../2/4"; the persistFormat value ".../2/3" sorts between them.
+static const KeyRangeRef persistRecoveryCountKeys = KeyRangeRef( LiteralStringRef( "DbRecoveryCount/" ), LiteralStringRef( "DbRecoveryCount0" ) );  // '0' is the byte right after '/', so (assuming a half-open range) this spans keys prefixed "DbRecoveryCount/".
+
+// Updated on updatePersistentData()
+static const KeyRangeRef persistCurrentVersionKeys = KeyRangeRef( LiteralStringRef( "version/" ), LiteralStringRef( "version0" ) );  // Same '/'-to-'0' construction for the "version/" prefix.
+static const KeyRange persistTagMessagesKeys = prefixRange(LiteralStringRef("TagMsg/"));  // begin is the "TagMsg/" prefix used by persistTagMessagesKey().
+static const KeyRange persistTagPoppedKeys = prefixRange(LiteralStringRef("TagPop/"));  // begin is the "TagPop/" prefix used by persistTagPoppedKey().
+
+static Key persistTagMessagesKey( UID id, Tag tag, Version version ) {  // Serializes, in order: the "TagMsg/" prefix, id, tag, version.
+	BinaryWriter wr( Unversioned() );
+	wr.serializeBytes(persistTagMessagesKeys.begin);  // Raw prefix bytes come first.
+	wr << id;
+	wr << tag;
+	wr << bigEndian64( version );  // Presumably big-endian so byte-wise key order follows version order -- confirm.
+	return wr.toStringRef();
+}
+
+static Key persistTagPoppedKey( UID id, Tag tag ) {  // Serializes, in order: the "TagPop/" prefix, id, tag.
+	BinaryWriter wr(Unversioned());
+	wr.serializeBytes( persistTagPoppedKeys.begin );  // Raw prefix bytes come first.
+	wr << id;
+	wr << tag;
+	return wr.toStringRef();
+}
+
+// Serializes a popped version with BinaryWriter (Unversioned) for storage
+// under a persistTagPoppedKey() key.
+static Value persistTagPoppedValue( Version popped ) {
+	Value encoded = BinaryWriter::toValue( popped, Unversioned() );
+	return encoded;
+}
+
+// Reads the Tag out of a popped-key: strips the "TagPop/" prefix, then the
+// caller-supplied `id` bytes, and deserializes what follows as a Tag.
+static Tag decodeTagPoppedKey( KeyRef id, KeyRef key ) {
+	Tag s;
+	KeyRef tagBytes = key.removePrefix(persistTagPoppedKeys.begin).removePrefix(id);
+	BinaryReader rd( tagBytes, Unversioned() );
+	rd >> s;
+	return s;
+}
+
+// Deserializes a Version from a stored value using BinaryReader (Unversioned).
+static Version decodeTagPoppedValue( ValueRef value ) {
+	Version popped = BinaryReader::fromStringRef<Version>( value, Unversioned() );
+	return popped;
+}
+
+// Drops the leading part of a tag-messages key: the "TagMsg/" prefix plus
+// sizeof(UID) + sizeof(Tag) bytes, returning whatever follows.
+// NOTE(review): assumes the serialized id and tag occupy exactly sizeof(UID)
+// and sizeof(Tag) bytes -- confirm against the writer.
+static StringRef stripTagMessagesKey( StringRef key ) {
+	const auto headerBytes = sizeof(UID) + sizeof(Tag) + persistTagMessagesKeys.begin.size();
+	return key.substr( headerBytes );
+}
+
+// Extracts the version from a tag-messages key: strips the key header, reads
+// the remaining bytes as a Version, and passes the result through bigEndian64().
+static Version decodeTagMessagesKey( StringRef key ) {
+	StringRef versionBytes = stripTagMessagesKey(key);
+	Version stored = BinaryReader::fromStringRef<Version>( versionBytes, Unversioned() );
+	return bigEndian64( stored );
+}
+
 struct TLogData : NonCopyable {
 	AsyncTrigger newLogData;
 	Deque<UID> queueOrder;
@ -238,13 +295,14 @@ struct TLogData : NonCopyable {
 	Future<Void> oldLogServer;

 	PromiseStream<Future<Void>> sharedActors;
+	bool terminated;

 	TLogData(UID dbgid, IKeyValueStore* persistentData, IDiskQueue * persistentQueue, Reference<AsyncVar<ServerDBInfo>> const& dbInfo)
 			: dbgid(dbgid), instanceID(g_random->randomUniqueID().first()),
 			  persistentData(persistentData), rawPersistentQueue(persistentQueue), persistentQueue(new TLogQueue(persistentQueue, dbgid)),
 			  dbInfo(dbInfo), queueCommitBegin(0), queueCommitEnd(0), prevVersion(0),
 			  diskQueueCommitBytes(0), largeDiskQueueCommitBytes(false),
-			  bytesInput(0), bytesDurable(0), updatePersist(Void())
+			  bytesInput(0), bytesDurable(0), updatePersist(Void()), terminated(false)
 		{
 		}
 };
@ -373,6 +431,16 @@ struct LogData : NonCopyable, public ReferenceCounted<LogData> {

 		ASSERT(tLogData->bytesDurable <= tLogData->bytesInput);
 		endRole(tli.id(), "TLog", "Error", true);
+
+		if(!tLogData->terminated) {
+			Key logIdKey = BinaryWriter::toValue(logId,Unversioned());
+			tLogData->persistentData->clear( singleKeyRange(logIdKey.withPrefix(persistCurrentVersionKeys.begin)) );
+			tLogData->persistentData->clear( singleKeyRange(logIdKey.withPrefix(persistRecoveryCountKeys.begin)) );
+			Key msgKey = logIdKey.withPrefix(persistTagMessagesKeys.begin);
+			tLogData->persistentData->clear( KeyRangeRef( msgKey, strinc(msgKey) ) );
+			Key poppedKey = logIdKey.withPrefix(persistTagPoppedKeys.begin);
+			tLogData->persistentData->clear( KeyRangeRef( poppedKey, strinc(poppedKey) ) );
+		}
 	}

 	LogEpoch epoch() const { return recoveryCount; }
@ -408,63 +476,6 @@ ACTOR Future<Void> tLogLock( TLogData* self, ReplyPromise< TLogLockResult > repl
 	return Void();
 }

-KeyRange prefixRange( KeyRef prefix ) {
-	Key end = strinc(prefix);
-	return KeyRangeRef( prefix, end );
-}
-
-////// Persistence format (for self->persistentData)
-
-// Immutable keys
-static const KeyValueRef persistFormat( LiteralStringRef( "Format" ), LiteralStringRef("FoundationDB/LogServer/2/3") );
-static const KeyRangeRef persistFormatReadableRange( LiteralStringRef("FoundationDB/LogServer/2/2"), LiteralStringRef("FoundationDB/LogServer/2/4") );
-static const KeyRangeRef persistRecoveryCountKeys = KeyRangeRef( LiteralStringRef( "DbRecoveryCount/" ), LiteralStringRef( "DbRecoveryCount0" ) );
-
-// Updated on updatePersistentData()
-static const KeyRangeRef persistCurrentVersionKeys = KeyRangeRef( LiteralStringRef( "version/" ), LiteralStringRef( "version0" ) );
-static const KeyRange persistTagMessagesKeys = prefixRange(LiteralStringRef("TagMsg/"));
-static const KeyRange persistTagPoppedKeys = prefixRange(LiteralStringRef("TagPop/"));
-
-static Key persistTagMessagesKey( UID id, Tag tag, Version version ) {
-	BinaryWriter wr( Unversioned() );
-	wr.serializeBytes(persistTagMessagesKeys.begin);
-	wr << id;
-	wr << tag;
-	wr << bigEndian64( version );
-	return wr.toStringRef();
-}
-
-static Key persistTagPoppedKey( UID id, Tag tag ) {
-	BinaryWriter wr(Unversioned());
-	wr.serializeBytes( persistTagPoppedKeys.begin );
-	wr << id;
-	wr << tag;
-	return wr.toStringRef();
-}
-
-static Value persistTagPoppedValue( Version popped ) {
-	return BinaryWriter::toValue( popped, Unversioned() );
-}
-
-static Tag decodeTagPoppedKey( KeyRef id, KeyRef key ) {
-	Tag s;
-	BinaryReader rd( key.removePrefix(persistTagPoppedKeys.begin).removePrefix(id), Unversioned() );
-	rd >> s;
-	return s;
-}
-
-static Version decodeTagPoppedValue( ValueRef value ) {
-	return BinaryReader::fromStringRef<Version>( value, Unversioned() );
-}
-
-static StringRef stripTagMessagesKey( StringRef key ) {
-	return key.substr( sizeof(UID) + sizeof(Tag) + persistTagMessagesKeys.begin.size() );
-}
-
-static Version decodeTagMessagesKey( StringRef key ) {
-	return bigEndian64( BinaryReader::fromStringRef<Version>( stripTagMessagesKey(key), Unversioned() ) );
-}
-
 void updatePersistentPopped( TLogData* self, Reference<LogData> logData, Tag tag, LogData::TagData& data ) {
 	if (!data.popped_recently) return;
 	self->persistentData->set(KeyValueRef( persistTagPoppedKey(logData->logId, tag), persistTagPoppedValue(data.popped) ));
@ -612,9 +623,15 @@ ACTOR Future<Void> updateStorage( TLogData* self ) {
 				} else {
 					Void _ = wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskUpdateStorage) );
 				}
+
+				if( logData->removed.isReady() ) {
+					break;
+				}
 			}

-			self->queueOrder.pop_front();
+			if(logData->persistentDataDurableVersion == logData->version.get()) {
+				self->queueOrder.pop_front();
+			}
 			Void _ = wait( delay(0.0, TaskUpdateStorage) );
 		} else {
 			Void _ = wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskUpdateStorage) );
@ -1083,7 +1100,7 @@ ACTOR Future<Void> initPersistentState( TLogData* self, Reference<LogData> logDa
 	return Void();
 }

-ACTOR Future<Void> rejoinMasters( TLogData* self, TLogInterface tli, DBRecoveryCount recoveryCount ) {
+ACTOR Future<Void> rejoinMasters( TLogData* self, TLogInterface tli, DBRecoveryCount recoveryCount, Future<Void> registerWithMaster ) {
 	state UID lastMasterID(0,0);
 	loop {
 		auto const& inf = self->dbInfo->get();
@ -1101,20 +1118,25 @@ ACTOR Future<Void> rejoinMasters( TLogData* self, TLogInterface tli, DBRecoveryC
 			throw worker_removed();
 		}

-		if (self->dbInfo->get().master.id() != lastMasterID) {
-			// The TLogRejoinRequest is needed to establish communications with a new master, which doesn't have our TLogInterface
-			TLogRejoinRequest req;
-			req.myInterface = tli;
-			TraceEvent("TLogRejoining", self->dbgid).detail("Master", self->dbInfo->get().master.id());
-			choose {
-				when ( bool success = wait( brokenPromiseToNever( self->dbInfo->get().master.tlogRejoin.getReply( req ) ) ) ) {
-					if (success)
-						lastMasterID = self->dbInfo->get().master.id();
+		if( registerWithMaster.isReady() ) {
+			if ( self->dbInfo->get().master.id() != lastMasterID) {
+				// The TLogRejoinRequest is needed to establish communications with a new master, which doesn't have our TLogInterface
+				TLogRejoinRequest req;
+				req.myInterface = tli;
+				TraceEvent("TLogRejoining", self->dbgid).detail("Master", self->dbInfo->get().master.id());
+				choose {
+					when ( bool success = wait( brokenPromiseToNever( self->dbInfo->get().master.tlogRejoin.getReply( req ) ) ) ) {
+						if (success)
+							lastMasterID = self->dbInfo->get().master.id();
+					}
+					when ( Void _ = wait( self->dbInfo->onChange() ) ) { }
 				}
-				when ( Void _ = wait( self->dbInfo->onChange() ) ) { }
+			} else {
+				Void _ = wait( self->dbInfo->onChange() );
 			}
-		} else
-			Void _ = wait( self->dbInfo->onChange() );
+		} else {
+			Void _ = wait( registerWithMaster || self->dbInfo->onChange() );
+		}
 	}
 }

@ -1129,11 +1151,11 @@ ACTOR Future<Void> respondToRecovered( TLogInterface tli, Future<Void> recovery

 ACTOR Future<Void> cleanupPeekTrackers( TLogData* self ) {
 	loop {
-		double minExpireTime = SERVER_KNOBS->PEEK_TRACKER_EXPIRATION_TIME;
+		double minTimeUntilExpiration = SERVER_KNOBS->PEEK_TRACKER_EXPIRATION_TIME;
 		auto it = self->peekTracker.begin();
 		while(it != self->peekTracker.end()) {
-			double expireTime = SERVER_KNOBS->PEEK_TRACKER_EXPIRATION_TIME - now()-it->second.lastUpdate;
-			if(expireTime < 1.0e-6) {
+			double timeUntilExpiration = it->second.lastUpdate + SERVER_KNOBS->PEEK_TRACKER_EXPIRATION_TIME - now();
+			if(timeUntilExpiration < 1.0e-6) {
 				for(auto seq : it->second.sequence_version) {
 					if(!seq.second.isSet()) {
 						seq.second.sendError(timed_out());
@ -1141,12 +1163,12 @@ ACTOR Future<Void> cleanupPeekTrackers( TLogData* self ) {
 				}
 				it = self->peekTracker.erase(it);
 			} else {
-				minExpireTime = std::min(minExpireTime, expireTime);
+				minTimeUntilExpiration = std::min(minTimeUntilExpiration, timeUntilExpiration);
 				++it;
 			}
 		}

-		Void _ = wait( delay(minExpireTime) );
+		Void _ = wait( delay(minTimeUntilExpiration) );
 	}
 }

@ -1240,7 +1262,7 @@ ACTOR Future<Void> tLogCore( TLogData* self, Reference<LogData> logData ) {
 	logData->addActor.send( respondToRecovered(logData->tli, logData->recovery) );
 	logData->addActor.send( logData->removed );
 	//FIXME: update tlogMetrics to include new information, or possibly only have one copy for the shared instance
-	logData->addActor.send( traceCounters("TLogMetrics", logData->logId, SERVER_KNOBS->STORAGE_LOGGING_DELAY, &logData->cc, self->dbgid.toString() + "/TLogMetrics"));
+	logData->addActor.send( traceCounters("TLogMetrics", logData->logId, SERVER_KNOBS->STORAGE_LOGGING_DELAY, &logData->cc, logData->logId.toString() + "/TLogMetrics"));
 	logData->addActor.send( serveTLogInterface(self, logData->tli, logData, warningCollectorInput) );

 	try {
@ -1267,7 +1289,7 @@ ACTOR Future<Void> checkEmptyQueue(TLogData* self) {
 	}
 }

-ACTOR Future<Void> restorePersistentState( TLogData* self, LocalityData locality, Promise<Void> oldLog, PromiseStream<InitializeTLogRequest> tlogRequests ) {
+ACTOR Future<Void> restorePersistentState( TLogData* self, LocalityData locality, Promise<Void> oldLog, Promise<Void> recovered, PromiseStream<InitializeTLogRequest> tlogRequests ) {
 	state double startt = now();
 	state Reference<LogData> logData;
 	state KeyRange tagKeys;
@ -1319,6 +1341,7 @@ ACTOR Future<Void> restorePersistentState( TLogData* self, LocalityData locality
 		DUMPTOKEN( recruited.confirmRunning );

 		//FIXME: needed for upgrades from 4.X to 5.0, remove once this upgrade path is no longer needed
+		if(recovered.canBeSet()) recovered.send(Void());
 		oldLog.send(Void());
 		while(!tlogRequests.isEmpty()) {
 			tlogRequests.getFuture().pop().reply.sendError(recruitment_failed());
@ -1331,6 +1354,7 @@ ACTOR Future<Void> restorePersistentState( TLogData* self, LocalityData locality
 	ASSERT(fVers.get().size() == fRecoverCounts.get().size());

 	state int idx = 0;
+	state Promise<Void> registerWithMaster;
 	for(idx = 0; idx < fVers.get().size(); idx++) {
 		state KeyRef rawId = fVers.get()[idx].key.removePrefix(persistCurrentVersionKeys.begin);
 		UID id1 = BinaryReader::fromStringRef<UID>( rawId, Unversioned() );
@ -1358,7 +1382,7 @@ ACTOR Future<Void> restorePersistentState( TLogData* self, LocalityData locality
 		logData->persistentDataDurableVersion = ver;
 		logData->version.set(ver);
 		logData->recoveryCount = BinaryReader::fromStringRef<DBRecoveryCount>( fRecoverCounts.get()[idx].value, Unversioned() );
-		logData->removed = rejoinMasters(self, recruited, logData->recoveryCount);
+		logData->removed = rejoinMasters(self, recruited, logData->recoveryCount, registerWithMaster.getFuture());
 		removed.push_back(errorOr(logData->removed));

 		TraceEvent("TLogRestorePersistentStateVer", id1).detail("ver", ver);
@ -1397,8 +1421,13 @@ ACTOR Future<Void> restorePersistentState( TLogData* self, LocalityData locality
 				when( TLogQueueEntry qe = wait( self->persistentQueue->readNext() ) ) {
 					if(!self->queueOrder.size() || self->queueOrder.back() != qe.id) self->queueOrder.push_back(qe.id);
 					if(qe.id != lastId) {
-						logData = self->id_data[qe.id];
 						lastId = qe.id;
+						auto it = self->id_data.find(qe.id);
+						if(it != self->id_data.end()) {
+							logData = it->second;
+						} else {
+							logData = Reference<LogData>();
+						}
 					} else {
 						ASSERT( qe.version >= lastVer );
 						lastVer = qe.version;
@ -1407,19 +1436,21 @@ ACTOR Future<Void> restorePersistentState( TLogData* self, LocalityData locality
 					//TraceEvent("TLogRecoveredQE", self->dbgid).detail("logId", qe.id).detail("ver", qe.version).detail("MessageBytes", qe.messages.size()).detail("Tags", qe.tags.size())
 					//	.detail("Tag0", qe.tags.size() ? qe.tags[0].tag : invalidTag).detail("version", logData->version.get());

-					logData->knownCommittedVersion = std::max(logData->knownCommittedVersion, qe.knownCommittedVersion);
-					if( qe.version > logData->version.get() ) {
-						commitMessages(logData, qe.version, qe.arena(), qe.messages, qe.tags, self->bytesInput);
-						logData->version.set( qe.version );
-						logData->queueCommittedVersion.set( qe.version );
+					if(logData) {
+						logData->knownCommittedVersion = std::max(logData->knownCommittedVersion, qe.knownCommittedVersion);
+						if( qe.version > logData->version.get() ) {
+							commitMessages(logData, qe.version, qe.arena(), qe.messages, qe.tags, self->bytesInput);
+							logData->version.set( qe.version );
+							logData->queueCommittedVersion.set( qe.version );

-						while (self->bytesInput - self->bytesDurable >= recoverMemoryLimit) {
-							TEST(true);  // Flush excess data during TLog queue recovery
-							TraceEvent("FlushLargeQueueDuringRecovery", self->dbgid).detail("BytesInput", self->bytesInput).detail("BytesDurable", self->bytesDurable).detail("Version", logData->version.get()).detail("PVer", logData->persistentDataVersion);
+							while (self->bytesInput - self->bytesDurable >= recoverMemoryLimit) {
+								TEST(true);  // Flush excess data during TLog queue recovery
+								TraceEvent("FlushLargeQueueDuringRecovery", self->dbgid).detail("BytesInput", self->bytesInput).detail("BytesDurable", self->bytesDurable).detail("Version", logData->version.get()).detail("PVer", logData->persistentDataVersion);

-							choose {
-								when( Void _ = wait( updateStorage(self) ) ) {}
-								when( Void _ = wait( allRemoved ) ) { throw worker_removed(); }
+								choose {
+									when( Void _ = wait( updateStorage(self) ) ) {}
+									when( Void _ = wait( allRemoved ) ) { throw worker_removed(); }
+								}
 							}
 						}
 					}
@ -1442,11 +1473,13 @@ ACTOR Future<Void> restorePersistentState( TLogData* self, LocalityData locality
 		self->sharedActors.send( tLogCore( self, it.second ) );
 	}

+	if(registerWithMaster.canBeSet()) registerWithMaster.send(Void());
 	return Void();
 }

 bool tlogTerminated( TLogData* self, IKeyValueStore* persistentData, TLogQueue* persistentQueue, Error const& e ) {
 	// Dispose the IKVS (destroying its data permanently) only if this shutdown is definitely permanent.  Otherwise just close it.
+	self->terminated = true;
 	if (e.code() == error_code_worker_removed || e.code() == error_code_recruitment_failed) {
 		persistentData->dispose();
 		persistentQueue->dispose();
@ -1472,7 +1505,7 @@ ACTOR Future<Void> recoverTagFromLogSystem( TLogData* self, Reference<LogData> l
 	state Version tagPopped = 0;
 	state Version lastVer = 0;

-	TraceEvent("LogRecoveringTagBegin", self->dbgid).detail("Tag", tag).detail("recoverAt", endVersion);
+	TraceEvent("LogRecoveringTagBegin", logData->logId).detail("Tag", tag).detail("recoverAt", endVersion);

 	while (tagAt <= endVersion) {
 		loop {
@ -1498,11 +1531,11 @@ ACTOR Future<Void> recoverTagFromLogSystem( TLogData* self, Reference<LogData> l
 		int writtenBytes = 0;
 		while (true) {
 			bool foundMessage = r->hasMessage();
-			//TraceEvent("LogRecoveringMsg").detail("Tag", tag).detail("foundMessage", foundMessage).detail("ver", r->version().toString());
+			//TraceEvent("LogRecoveringMsg", logData->logId).detail("Tag", tag).detail("foundMessage", foundMessage).detail("ver", r->version().toString());
 			if (!foundMessage || r->version().version != ver) {
 				ASSERT(r->version().version > lastVer);
 				if (ver) {
-					//TraceEvent("LogRecoveringTagVersion", self->dbgid).detail("Tag", tag).detail("Ver", ver).detail("Bytes", wr.getLength());
+					//TraceEvent("LogRecoveringTagVersion", logData->logId).detail("Tag", tag).detail("Ver", ver).detail("Bytes", wr.getLength());
 					writtenBytes += 100 + wr.getLength();
 					self->persistentData->set( KeyValueRef( persistTagMessagesKey( logData->logId, tag, ver ), wr.toStringRef() ) );
 				}
@ -1539,6 +1572,8 @@ ACTOR Future<Void> recoverTagFromLogSystem( TLogData* self, Reference<LogData> l
 	Void _ = wait(tLogPop( self, TLogPopRequest(tagPopped, tag), logData ));

 	updatePersistentPopped( self, logData, tag, logData->tag_data.find(tag)->value );
+
+	TraceEvent("LogRecoveringTagComplete", logData->logId).detail("Tag", tag).detail("recoverAt", endVersion);
 	return Void();
 }

@ -1571,52 +1606,59 @@ ACTOR Future<Void> recoverFromLogSystem( TLogData* self, Reference<LogData> logD
 	state Future<Void> recoveryDone = Never();
 	state Future<Void> commitTimeout = delay(SERVER_KNOBS->LONG_TLOG_COMMIT_TIME);

-	loop {
-		choose {
-			when(Void _ = wait(copyDone)) {
-				recoverFutures.clear();
-				for(auto tag : recoverTags )
-					recoverFutures.push_back(recoverTagFromLogSystem(self, logData, 0, knownCommittedVersion, tag, uncommittedBytes, logSystem));
-				copyDone = Never();
-				recoveryDone =  waitForAll(recoverFutures);
+	try {
+		loop {
+			choose {
+				when(Void _ = wait(copyDone)) {
+					recoverFutures.clear();
+					for(auto tag : recoverTags )
+						recoverFutures.push_back(recoverTagFromLogSystem(self, logData, 0, knownCommittedVersion, tag, uncommittedBytes, logSystem));
+					copyDone = Never();
+					recoveryDone =  waitForAll(recoverFutures);

-				Void __ = wait( committing );
-				Void __ = wait( self->updatePersist );
-				committing = self->persistentData->commit();
-				commitTimeout = delay(SERVER_KNOBS->LONG_TLOG_COMMIT_TIME);
-				uncommittedBytes->set(0);
-				Void __ = wait( committing );
-				TraceEvent("TLogCommitCopyData", self->dbgid);
+					Void __ = wait( committing );
+					Void __ = wait( self->updatePersist );
+					committing = self->persistentData->commit();
+					commitTimeout = delay(SERVER_KNOBS->LONG_TLOG_COMMIT_TIME);
+					uncommittedBytes->set(0);
+					Void __ = wait( committing );
+					TraceEvent("TLogCommitCopyData", logData->logId);

-				if(!copyComplete.isSet())
-					copyComplete.send(Void());
-			}
-			when(Void _ = wait(recoveryDone)) { break; }
-			when(Void _ = wait(commitTimeout)) {
-				TEST(true); // We need to commit occasionally if this process is long to avoid running out of memory.
-				// We let one, but not more, commits pipeline with the network transfer
-				Void __ = wait( committing );
-				Void __ = wait( self->updatePersist );
-				committing = self->persistentData->commit();
-				commitTimeout = delay(SERVER_KNOBS->LONG_TLOG_COMMIT_TIME);
-				uncommittedBytes->set(0);
-				//TraceEvent("TLogCommitRecoveryData", self->dbgid).detail("MemoryUsage", DEBUG_DETERMINISM ? 0 : getMemoryUsage());
-			}
-			when(Void _ = wait(uncommittedBytes->onChange())) {
-				if(uncommittedBytes->get() >= SERVER_KNOBS->LARGE_TLOG_COMMIT_BYTES)
-					commitTimeout = Void();
+					if(!copyComplete.isSet())
+						copyComplete.send(Void());
+				}
+				when(Void _ = wait(recoveryDone)) { break; }
+				when(Void _ = wait(commitTimeout)) {
+					TEST(true); // We need to commit occasionally if this process is long to avoid running out of memory.
+					// We let one, but not more, commits pipeline with the network transfer
+					Void __ = wait( committing );
+					Void __ = wait( self->updatePersist );
+					committing = self->persistentData->commit();
+					commitTimeout = delay(SERVER_KNOBS->LONG_TLOG_COMMIT_TIME);
+					uncommittedBytes->set(0);
+					//TraceEvent("TLogCommitRecoveryData", self->dbgid).detail("MemoryUsage", DEBUG_DETERMINISM ? 0 : getMemoryUsage());
+				}
+				when(Void _ = wait(uncommittedBytes->onChange())) {
+					if(uncommittedBytes->get() >= SERVER_KNOBS->LARGE_TLOG_COMMIT_BYTES)
+						commitTimeout = Void();
+				}
 			}
 		}
+
+		Void _ = wait( committing );
+		Void _ = wait( self->updatePersist );
+		Void _ = wait( self->persistentData->commit() );
+
+		TraceEvent("TLogRecoveryComplete", logData->logId).detail("Locality", self->dbInfo->get().myLocality.toString());
+		TEST(true);  // tLog restore from old log system completed
+
+		return Void();
+	} catch( Error &e ) {
+		TraceEvent("TLogRecoveryError", logData->logId).error(e,true);
+		if(!copyComplete.isSet())
+			copyComplete.sendError(worker_removed());
+		throw;
 	}
-
-	Void _ = wait( committing );
-	Void _ = wait( self->updatePersist );
-	Void _ = wait( self->persistentData->commit() );
-
-	TraceEvent("TLogRecoveryComplete", self->dbgid).detail("Locality", self->dbInfo->get().myLocality.toString());
-	TEST(true);  // tLog restore from old log system completed
-
-	return Void();
 }

 ACTOR Future<Void> tLogStart( TLogData* self, InitializeTLogRequest req, LocalityData locality ) {
@ -1639,7 +1681,7 @@ ACTOR Future<Void> tLogStart( TLogData* self, InitializeTLogRequest req, Localit
 	state Reference<LogData> logData = Reference<LogData>( new LogData(self, recruited) );
 	self->id_data[recruited.id()] = logData;
 	logData->recoveryCount = req.epoch;
-	logData->removed = rejoinMasters(self, recruited, req.epoch);
+	logData->removed = rejoinMasters(self, recruited, req.epoch, Future<Void>(Void()));
 	self->queueOrder.push_back(recruited.id());

 	TraceEvent("TLogStart", logData->logId);
@ -1691,7 +1733,7 @@ ACTOR Future<Void> tLogStart( TLogData* self, InitializeTLogRequest req, Localit
 }

 // New tLog (if !recoverFrom.size()) or restore from network
-ACTOR Future<Void> tLog( IKeyValueStore* persistentData, IDiskQueue* persistentQueue, Reference<AsyncVar<ServerDBInfo>> db, LocalityData locality, PromiseStream<InitializeTLogRequest> tlogRequests, UID tlogId, bool restoreFromDisk, Promise<Void> oldLog )
+ACTOR Future<Void> tLog( IKeyValueStore* persistentData, IDiskQueue* persistentQueue, Reference<AsyncVar<ServerDBInfo>> db, LocalityData locality, PromiseStream<InitializeTLogRequest> tlogRequests, UID tlogId, bool restoreFromDisk, Promise<Void> oldLog, Promise<Void> recovered )
 {
 	state TLogData self( tlogId, persistentData, persistentQueue, db );
 	state Future<Void> error = actorCollection( self.sharedActors.getFuture() );
@ -1700,11 +1742,13 @@ ACTOR Future<Void> tLog( IKeyValueStore* persistentData, IDiskQueue* persistentQ

 	try {
 		if(restoreFromDisk) {
-			Void _ = wait( restorePersistentState( &self, locality, oldLog, tlogRequests ) );
+			Void _ = wait( restorePersistentState( &self, locality, oldLog, recovered, tlogRequests ) );
 		} else {
 			Void _ = wait( checkEmptyQueue(&self) );
 		}

+		if(recovered.canBeSet()) recovered.send(Void());
+
 		self.sharedActors.send( cleanupPeekTrackers(&self) );
 		self.sharedActors.send( commitQueue(&self) );
 		self.sharedActors.send( updateStorageLoop(&self) );
@ -1723,11 +1767,11 @@ ACTOR Future<Void> tLog( IKeyValueStore* persistentData, IDiskQueue* persistentQ
 			}
 		}
 	} catch (Error& e) {
-		TraceEvent("TLogError", tlogId).error(e);
-		if(e.code() != error_code_actor_cancelled) {
-			while(!tlogRequests.isEmpty()) {
-				tlogRequests.getFuture().pop().reply.sendError(e);
-			}
+		TraceEvent("TLogError", tlogId).error(e, true);
+		if(recovered.canBeSet()) recovered.send(Void());
+
+		while(!tlogRequests.isEmpty()) {
+			tlogRequests.getFuture().pop().reply.sendError(recruitment_failed());
 		}

 		for( auto& it : self.id_data ) {
--- a/fdbserver/TagPartitionedLogSystem.actor.cpp
+++ b/fdbserver/TagPartitionedLogSystem.actor.cpp
@ -329,15 +329,63 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
 		}
 	}

-	virtual Future<Void> confirmEpochLive(Optional<UID> debugID) {
-		// Returns success after confirming that pushes in the current epoch are still possible
-		// FIXME: This is way too conservative?
-		vector<Future<Void>> alive;
-		for(auto& t : logServers) {
-			if( t->get().present() ) alive.push_back( brokenPromiseToNever( t->get().interf().confirmRunning.getReply(TLogConfirmRunningRequest(debugID), TaskTLogConfirmRunningReply ) ) );
-			else alive.push_back( Never() );
+	ACTOR static Future<Void> confirmEpochLive_internal(TagPartitionedLogSystem* self, Optional<UID> debugID) {
+		state vector<Future<Void>> alive;
+		int numPresent = 0;
+		for(auto& t : self->logServers) {
+			if( t->get().present() ) {
+				alive.push_back( brokenPromiseToNever(
+				    t->get().interf().confirmRunning.getReply( TLogConfirmRunningRequest(debugID),
+				                                               TaskTLogConfirmRunningReply ) ) );
+				numPresent++;
+			} else {
+				alive.push_back( Never() );
+			}
 		}
-		return quorum( alive, alive.size() - tLogWriteAntiQuorum );
+
+		Void _ = wait( quorum( alive, std::min(self->tLogReplicationFactor, numPresent - self->tLogWriteAntiQuorum) ) );
+
+		loop {
+			LocalityGroup locked;
+			std::vector<LocalityData> unlocked, unused;
+			for (int i = 0; i < alive.size(); i++) {
+				if (alive[i].isReady() && !alive[i].isError()) {
+					locked.add(self->tLogLocalities[i]);
+				} else {
+					unlocked.push_back(self->tLogLocalities[i]);
+				}
+			}
+			bool quorum_obtained = locked.validate(self->tLogPolicy);
+			if (!quorum_obtained && self->tLogWriteAntiQuorum != 0) {
+				quorum_obtained = !validateAllCombinations(unused, locked, self->tLogPolicy, unlocked, self->tLogWriteAntiQuorum, false);
+			}
+			if (self->tLogReplicationFactor - self->tLogWriteAntiQuorum == 1 && locked.size() > 0) {
+				ASSERT(quorum_obtained);
+			}
+			if (quorum_obtained) {
+				return Void();
+			}
+
+			// The current set of responders wasn't enough to form a quorum, so we must
+			// wait for more responses and try again.
+			std::vector<Future<Void>> changes;
+			for (int i = 0; i < alive.size(); i++) {
+				if (!alive[i].isReady()) {
+					changes.push_back( ready(alive[i]) );
+				} else if (alive[i].isReady() && alive[i].isError() &&
+				           alive[i].getError().code() == error_code_tlog_stopped) {
+					// All commits must go to all TLogs.  If any TLog is stopped, then our epoch has ended.
+					return Never();
+				}
+			}
+			ASSERT(changes.size() != 0);
+			Void _ = wait( waitForAny(changes) );
+		}
+	}
+
+	// Returns success after confirming that pushes in the current epoch are still possible.
+	virtual Future<Void> confirmEpochLive(Optional<UID> debugID) {
+		return confirmEpochLive_internal(this, debugID);
 	}

 	virtual Future<Reference<ILogSystem>> newEpoch( vector<WorkerInterface> availableLogServers, DatabaseConfiguration const& config, LogEpoch recoveryCount ) {
@ -522,7 +570,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
 		// Creates a new logSystem representing the (now frozen) epoch
 		// No other important side effects.
 		// The writeQuorum in the master info is from the previous configuration
-		state vector<Future<TLogLockResult>> tLogReply;
+		state vector<Future<TLogLockResult>> tLogReply(prevState.tLogs.size());

 		if (!prevState.tLogs.size()) {
 			// This is a brand new database
@ -545,7 +593,8 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
 		// To ensure consistent recovery, the number of servers NOT in the write quorum plus the number of servers NOT in the read quorum
 		// have to be strictly less than the replication factor.  Otherwise there could be a replica set consisting entirely of servers that
 		// are out of date due to not being in the write quorum or unavailable due to not being in the read quorum.
-		// So (N - W) + (N - R) < F, and optimally (N-W)+(N-R)=F-1.  Thus R=2N+1-F-W.
+		// So with N = # of tlogs, W = antiquorum, R = required count, F = replication factor,
+		// W + (N - R) < F, and optimally W + (N - R) = F - 1.  Thus R = N + 1 - F + W.
 		state int requiredCount = (int)prevState.tLogs.size()+1 - prevState.tLogReplicationFactor + prevState.tLogWriteAntiQuorum;
 		ASSERT( requiredCount > 0 && requiredCount <= prevState.tLogs.size() );
 		ASSERT( prevState.tLogReplicationFactor >= 1 && prevState.tLogReplicationFactor <= prevState.tLogs.size() );
@ -579,8 +628,12 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
 		}
 		state Future<Void> rejoins = trackRejoins( dbgid, allLogServers, rejoinRequests );

-		for(int t=0; t<logServers.size(); t++)
-			tLogReply.push_back( lockTLog( dbgid, logServers[t]) );
+		state bool buggify_lock_minimal_tlogs = BUGGIFY;
+		if (!buggify_lock_minimal_tlogs) {
+			for(int t=0; t<logServers.size(); t++) {
+				tLogReply[t] = lockTLog( dbgid, logServers[t]);
+			}
+		}

 		state Optional<Version> last_end;

@ -588,6 +641,9 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
 		state int	cycles = 0;

 		loop {
+			if (buggify_lock_minimal_tlogs) {
+				lockMinimalTLogSet( dbgid, prevState, logServers, logFailed, &tLogReply );
+			}
 			std::vector<LocalityData>	availableItems, badCombo;
 			std::vector<TLogLockResult> results;
 			std::string	sServerState;
@ -596,7 +652,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
 			cycles ++;

 			for(int t=0; t<logServers.size(); t++) {
-				if (tLogReply[t].isReady() && !tLogReply[t].isError() && !logFailed[t]->get()) {
+				if (tLogReply[t].isValid() && tLogReply[t].isReady() && !tLogReply[t].isError() && !logFailed[t]->get()) {
 					results.push_back(tLogReply[t].get());
 					availableItems.push_back(prevState.tLogLocalities[t]);
 					sServerState += 'a';
@ -621,7 +677,6 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
 					(!validateAllCombinations(badCombo, unResponsiveSet, prevState.tLogPolicy, availableItems, prevState.tLogWriteAntiQuorum, false)))
 			{
 				TraceEvent("EpochEndBadCombo", dbgid).detail("Cycles", cycles)
-					.detail("Required", requiredCount)
 					.detail("Present", results.size())
 					.detail("Available", availableItems.size())
 					.detail("Absent", logServers.size() - results.size())
@ -659,7 +714,6 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS

 					TraceEvent("LogSystemRecovery", dbgid).detail("Cycles", cycles)
 						.detail("TotalServers", logServers.size())
-						.detail("Required", requiredCount)
 						.detail("Present", results.size())
 						.detail("Available", availableItems.size())
 						.detail("Absent", logServers.size() - results.size())
@ -702,7 +756,6 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
 				else {
 					TraceEvent("LogSystemUnchangedRecovery", dbgid).detail("Cycles", cycles)
 						.detail("TotalServers", logServers.size())
-						.detail("Required", requiredCount)
 						.detail("Present", results.size())
 						.detail("Available", availableItems.size())
 						.detail("Absent", logServers.size() - results.size())
@ -725,9 +778,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
 			else {
 				TraceEvent("LogSystemWaitingForRecovery", dbgid).detail("Cycles", cycles)
 					.detail("AvailableServers", results.size())
-					.detail("RequiredServers", requiredCount)
 					.detail("TotalServers", logServers.size())
-					.detail("Required", requiredCount)
 					.detail("Present", results.size())
 					.detail("Available", availableItems.size())
 					.detail("Absent", logServers.size() - results.size())
@ -743,11 +794,14 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
 			// Wait for anything relevant to change
 			std::vector<Future<Void>> changes;
 			for(int i=0; i<logServers.size(); i++) {
-				if (!tLogReply[i].isReady())
+				if (tLogReply[i].isValid() && !tLogReply[i].isReady()) {
 					changes.push_back( ready(tLogReply[i]) );
-				else {
-					changes.push_back( logServers[i]->onChange() );
+					if(buggify_lock_minimal_tlogs) {
+						changes.push_back( logFailed[i]->onChange() );
+					}
+				} else {
 					changes.push_back( logFailed[i]->onChange() );
+					changes.push_back( logServers[i]->onChange() );
 				}
 			}
 			ASSERT(changes.size());
@ -792,6 +846,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
 			req.recoverAt = oldLogSystem->epochEndVersion.get();
 			req.knownCommittedVersion = oldLogSystem->knownCommittedVersion;
 			req.epoch = recoveryCount;
+			TraceEvent("TLogInitializeRequest").detail("address", workers[i].tLog.getEndpoint().address);
 		}

 		logSystem->tLogLocalities.resize( workers.size() );
@ -874,6 +929,173 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogS
 		}
 	}

+	// Decides which of the old TLogs described by prevState should receive lock requests and
+	// issues those requests, deliberately leaving some TLogs unlocked.
+	//
+	// dbgid:      id attached to the trace events emitted here and passed to lockTLog().
+	// prevState:  supplies tLogLocalities, tLogPolicy, tLogWriteAntiQuorum and tLogReplicationFactor.
+	// logServers: TLog interfaces, indexed in parallel with tLogReply.
+	// logFailed:  per-TLog failure flags, indexed in parallel with tLogReply.
+	// tLogReply:  in/out. Entries classified as failed are reset to invalid Futures; entries chosen
+	//             for locking that are not yet valid are replaced by lockTLog() futures; entries
+	//             that are skipped are left as !isValid() Futures.
+	//
+	// Returns early, without modifying tLogReply, when the failed TLogs by themselves pass
+	// can_obtain_quorum (traced as MasterRecoveryTLogLockingImpossible).
+	static void lockMinimalTLogSet(const UID& dbgid, const DBCoreState& prevState,
+	                               const std::vector<Reference<AsyncVar<OptionalInterface<TLogInterface>>>>& logServers,
+	                               const std::vector<Reference<AsyncVar<bool>>>& logFailed,
+	                               vector<Future<TLogLockResult>>* tLogReply ) {
+		// Invariant: tLogReply[i] must correspond to the tlog stored as logServers[i].
+		ASSERT(tLogReply->size() == prevState.tLogLocalities.size());
+		ASSERT(logFailed.size() == tLogReply->size());
+
+		// For any given index, only one of the following will be true.
+		// completed: not marked failed, and the lock reply is ready without error.
+		auto locking_completed = [&logFailed, tLogReply](int index) {
+			const auto& entry = tLogReply->at(index);
+			return !logFailed[index]->get() && entry.isValid() && entry.isReady() && !entry.isError();
+		};
+		// failed: marked failed, or the lock reply is ready with an error.
+		auto locking_failed = [&logFailed, tLogReply](int index) {
+			const auto& entry = tLogReply->at(index);
+			return logFailed[index]->get() || (entry.isValid() && entry.isReady() && entry.isError());
+		};
+		// pending: not marked failed, and a lock request is outstanding (valid but not ready).
+		auto locking_pending = [&logFailed, tLogReply](int index) {
+			const auto& entry = tLogReply->at(index);
+			return !logFailed[index]->get() && (entry.isValid() && !entry.isReady());
+		};
+		// skipped: not marked failed, and no lock request has been issued (invalid Future).
+		auto locking_skipped = [&logFailed, tLogReply](int index) {
+			const auto& entry = tLogReply->at(index);
+			return !logFailed[index]->get() && !entry.isValid();
+		};
+
+		// Splits prevState.tLogLocalities by `filter` and checks whether the localities selected
+		// by the filter validate against prevState.tLogPolicy. When they do not and an anti-quorum
+		// is configured, the answer is taken from validateAllCombinations() instead.
+		// NOTE(review): the validateAllCombinations() result is negated here; confirm its return
+		// convention against its definition.
+		auto can_obtain_quorum = [&prevState](std::function<bool(int)> filter) {
+			LocalityGroup filter_true;
+			std::vector<LocalityData> filter_false, unused;
+			for (int i = 0; i < prevState.tLogLocalities.size() ; i++) {
+				if (filter(i)) {
+					filter_true.add(prevState.tLogLocalities[i]);
+				} else {
+					filter_false.push_back(prevState.tLogLocalities[i]);
+				}
+			}
+			bool valid = filter_true.validate(prevState.tLogPolicy);
+			if (!valid && prevState.tLogWriteAntiQuorum > 0 ) {
+				valid = !validateAllCombinations(unused, filter_true, prevState.tLogPolicy, filter_false, prevState.tLogWriteAntiQuorum, false);
+			}
+			return valid;
+		};
+
+		// Step 1: Verify that if all the failed TLogs come back, they can't form a quorum.
+		if (can_obtain_quorum(locking_failed)) {
+			TraceEvent(SevInfo, "MasterRecoveryTLogLockingImpossible", dbgid);
+			return;
+		}
+
+		// Step 2: It's possible for us to succeed, but we need to lock additional logs.
+		//
+		// First, we need an accurate picture of what TLogs we're capable of locking. We can't tell the
+		// difference between a temporarily failed TLog and a permanently failed TLog. Thus, we assume
+		// all failures are permanent, and manually re-issue lock requests if they rejoin.
+		for (int i = 0; i < logFailed.size(); i++) {
+			const auto& r = tLogReply->at(i);
+			TEST(locking_failed(i) && (r.isValid() && !r.isReady()));  // A TLog failed with a pending request.
+			// The reboot_a_tlog BUGGIFY below should cause the above case to be hit.
+			if (locking_failed(i)) {
+				// Drop the reply so this index no longer carries a valid Future.
+				tLogReply->at(i) = Future<TLogLockResult>();
+			}
+		}
+
+		// We're trying to partition the set of old tlogs into two sets, L and R, such that:
+		// (1). R does not validate the policy
+		// (2). |R| is as large as possible
+		// (3). L contains all the already-locked TLogs
+		// and then we only issue lock requests to TLogs in L. This is safe, as R does not have quorum,
+		// so no commits may occur.  It does not matter if L forms a quorum or not.
+		//
+		// We form these sets by starting with L as all machines and R as the empty set, and moving a
+		// random machine from L to R until (1) or (2) no longer holds as true. Code-wise, L is
+		// [0..end-can_omit), and R is [end-can_omit..end), and we move a random machine via randomizing
+		// the order of the tlogs. Choosing a random machine was verified to generate a good-enough
+		// result to be interesting in tests sufficiently frequently that we don't need to try to
+		// calculate the exact optimal solution.
+		std::vector<std::pair<LocalityData, int>> tlogs;
+		for (int i = 0; i < prevState.tLogLocalities.size(); i++) {
+			tlogs.emplace_back(prevState.tLogLocalities[i], i);
+		}
+		g_random->randomShuffle(tlogs);
+		// Rearrange the array so that the left holds logs closer to being locked, and the right
+		// holds logs that can't be locked.  This makes us prefer locking already-locked TLogs,
+		// which is how we respect the decisions made in the previous execution.
+		// Sort key: completed = 0, pending = 1, skipped = 2, failed = 3.
+		auto idx_to_order = [&locking_completed, &locking_failed, &locking_pending, &locking_skipped](int index) {
+			bool complete = locking_completed(index);
+			bool pending = locking_pending(index);
+			bool skipped = locking_skipped(index);
+			bool failed = locking_failed(index);
+
+			ASSERT( complete + pending + skipped + failed == 1 );
+
+			if (complete) return 0;
+			if (pending) return 1;
+			if (skipped) return 2;
+			if (failed) return 3;
+
+			ASSERT(false);  // Programmer error.
+			return -1;
+		};
+		std::sort(tlogs.begin(), tlogs.end(),
+		    // TODO: Change long type to `auto` once toolchain supports C++17.
+		    [&idx_to_order](const std::pair<LocalityData, int>& lhs, const std::pair<LocalityData, int>& rhs) {
+		    	return idx_to_order(lhs.second) < idx_to_order(rhs.second);
+		    });
+
+		// Indexes that aren't in the vector are the ones we're considering omitting. Remove indexes until
+		// the removed set forms a quorum.
+		int can_omit = 0;
+		std::vector<int> to_lock_indexes;
+		// Start with every tlog except the last one in sorted order, so the omitted set begins
+		// with exactly one member.
+		for (auto it = tlogs.cbegin() ; it != tlogs.cend() - 1 ; it++ ) {
+			to_lock_indexes.push_back(it->second);
+		}
+		// True for indexes that are NOT in to_lock_indexes, i.e. the currently omitted set.
+		auto filter = [&to_lock_indexes](int index) {
+			return std::find(to_lock_indexes.cbegin(), to_lock_indexes.cend(), index) == to_lock_indexes.cend();
+		};
+		// Grow the omitted set from the tail one tlog at a time. On exit the omitted set of size
+		// can_omit+1 passes can_obtain_quorum, and the trailing can_omit tlogs did not.
+		while(true) {
+			if (can_obtain_quorum(filter)) {
+				break;
+			} else {
+				can_omit++;
+				ASSERT(can_omit < tlogs.size());
+				to_lock_indexes.pop_back();
+			}
+		}
+
+		if (prevState.tLogReplicationFactor - prevState.tLogWriteAntiQuorum == 1) {
+			ASSERT(can_omit == 0);
+		}
+		// Our previous check of making sure there aren't too many failed logs should have prevented this.
+		ASSERT(!locking_failed(tlogs[tlogs.size()-can_omit-1].second));
+
+		// If we've managed to leave more tlogs unlocked than (RF-AQ), it means we've hit the case
+		// where the policy engine has allowed us to have multiple logs in the same failure domain
+		// with independent sets of data. This case validates that no code is relying on the old
+		// quorum=(RF-AQ) logic, and now goes through the policy engine instead.
+		TEST(can_omit >= prevState.tLogReplicationFactor - prevState.tLogWriteAntiQuorum);  // Locking a subset of the TLogs while ending an epoch.
+		const bool reboot_a_tlog = g_network->now() - g_simulator.lastConnectionFailure > g_simulator.connectionFailuresDisableDuration && BUGGIFY && g_random->random01() < 0.25;
+		TraceEvent(SevInfo, "MasterRecoveryTLogLocking", dbgid)
+		    .detail("locks", tlogs.size() - can_omit)
+		    .detail("skipped", can_omit)
+		    .detail("replication", prevState.tLogReplicationFactor)
+		    .detail("antiquorum", prevState.tLogWriteAntiQuorum)
+		    .detail("reboot_buggify", reboot_a_tlog);
+		// Issue lock requests to the first (tlogs.size() - can_omit) tlogs in sorted order,
+		// keeping any request that is already in flight or complete.
+		for (int i = 0; i < tlogs.size() - can_omit; i++) {
+			const int index = tlogs[i].second;
+			Future<TLogLockResult>& entry = tLogReply->at(index);
+			if (!entry.isValid()) {
+				entry = lockTLog( dbgid, logServers[index] );
+			}
+		}
+		if (reboot_a_tlog) {
+			// Record the time of this injected failure, then reboot the first to-be-locked tlog
+			// whose interface is currently present.
+			g_simulator.lastConnectionFailure = g_network->now();
+			for (int i = 0; i < tlogs.size() - can_omit; i++) {
+				const int index = tlogs[i].second;
+				if (logServers[index]->get().present()) {
+					g_simulator.rebootProcess(
+					    g_simulator.getProcessByAddress(
+					        logServers[index]->get().interf().address()),
+					    ISimulator::RebootProcess);
+					break;
+				}
+			}
+		}
+		// Intentionally leave `tlogs.size() - can_omit` .. `tlogs.size()` as !isValid() Futures.
+  }
+
 	template <class T>
 	static vector<T> getReadyNonError( vector<Future<T>> const& futures ) {
 		// Return the values of those futures which have (non-error) values ready
--- a/fdbserver/TesterInterface.h
+++ b/fdbserver/TesterInterface.h
@ -83,11 +83,11 @@ struct TesterInterface {
 	}
 };

-Future<Void> testerServerCore( TesterInterface const& interf, Reference<ClusterConnectionFile> const& ccf, Reference<AsyncVar<struct ServerDBInfo>> const& );
+Future<Void> testerServerCore( TesterInterface const& interf, Reference<ClusterConnectionFile> const& ccf, Reference<AsyncVar<struct ServerDBInfo>> const&, LocalityData const& );

 enum test_location_t { TEST_HERE, TEST_ON_SERVERS, TEST_ON_TESTERS };
 enum test_type_t { TEST_TYPE_FROM_FILE, TEST_TYPE_CONSISTENCY_CHECK };

-Future<Void> runTests( Reference<ClusterConnectionFile> const& connFile, test_type_t const& whatToRun, test_location_t const& whereToRun, int const& minTestersExpected, std::string const& fileName = std::string(), StringRef const& startingConfiguration = StringRef() );
+Future<Void> runTests( Reference<ClusterConnectionFile> const& connFile, test_type_t const& whatToRun, test_location_t const& whereToRun, int const& minTestersExpected, std::string const& fileName = std::string(), StringRef const& startingConfiguration = StringRef(), LocalityData const& locality = LocalityData() );

 #endif
--- a/fdbserver/WorkerInterface.h
+++ b/fdbserver/WorkerInterface.h
@ -273,7 +273,7 @@ Future<Void> extractClusterInterface( Reference<AsyncVar<Optional<struct Cluster

 Future<Void> fdbd( Reference<ClusterConnectionFile> const&, LocalityData const& localities, ProcessClass const& processClass, std::string const& dataFolder, std::string const& coordFolder, int64_t const& memoryLimit, std::string const& metricsConnFile, std::string const& metricsPrefix );
 Future<Void> workerServer( Reference<ClusterConnectionFile> const&, Reference<AsyncVar<Optional<struct ClusterControllerFullInterface>>> const& ccInterface, LocalityData const& localities, ProcessClass const& processClass, std::string const& filename, int64_t const& memoryLimit, Future<Void> const& forceFailure, std::string const& metricsConnFile, std::string const& metricsPrefix );
-Future<Void> clusterController( Reference<ClusterConnectionFile> const&, Reference<AsyncVar<Optional<ClusterControllerFullInterface>>> const& currentCC );
+Future<Void> clusterController( Reference<ClusterConnectionFile> const&, Reference<AsyncVar<Optional<ClusterControllerFullInterface>>> const& currentCC, Reference<AsyncVar<ProcessClass>> const& asyncProcessClass );

 // These servers are started by workerServer
 Future<Void> storageServer(
@ -287,14 +287,14 @@ Future<Void> storageServer(
 				class IKeyValueStore* const& persistentData,
 				StorageServerInterface const& ssi,
 				Reference<AsyncVar<ServerDBInfo>> const& db,
-				std::string const& folder );  // changes pssi->id() to be the recovered ID
+				std::string const& folder,
+				Promise<Void> const& recovered);  // changes pssi->id() to be the recovered ID
 Future<Void> masterServer( MasterInterface const& mi, Reference<AsyncVar<ServerDBInfo>> const& db, class ServerCoordinators const&, LifetimeToken const& lifetime );
 Future<Void> masterProxyServer(MasterProxyInterface const& proxy, InitializeMasterProxyRequest const& req, Reference<AsyncVar<ServerDBInfo>> const& db);
-Future<Void> tLog( class IKeyValueStore* const& persistentData, class IDiskQueue* const& persistentQueue, Reference<AsyncVar<ServerDBInfo>> const& db, LocalityData const& locality, PromiseStream<InitializeTLogRequest> const& tlogRequests, UID const& tlogId, bool const& restoreFromDisk, Promise<Void> const& oldLog );  // changes tli->id() to be the recovered ID
+Future<Void> tLog( class IKeyValueStore* const& persistentData, class IDiskQueue* const& persistentQueue, Reference<AsyncVar<ServerDBInfo>> const& db, LocalityData const& locality, PromiseStream<InitializeTLogRequest> const& tlogRequests, UID const& tlogId, bool const& restoreFromDisk, Promise<Void> const& oldLog, Promise<Void> const& recovered );  // changes tli->id() to be the recovered ID
 Future<Void> debugQueryServer( DebugQueryRequest const& req );
 Future<Void> monitorServerDBInfo( Reference<AsyncVar<Optional<ClusterControllerFullInterface>>> const& ccInterface, Reference<ClusterConnectionFile> const&, LocalityData const&, Reference<AsyncVar<ServerDBInfo>> const& dbInfo );
 Future<Void> resolver( ResolverInterface const& proxy, InitializeResolverRequest const&, Reference<AsyncVar<ServerDBInfo>> const& db );
-Future<Void> runMetrics( Future<Database> const& fcx, Key const& metricsPrefix );

 void registerThreadForProfiling();
 void updateCpuProfiler(ProfilerRequest req);
--- a/fdbserver/fdbserver.actor.cpp
+++ b/fdbserver/fdbserver.actor.cpp
@ -802,6 +802,45 @@ extern bool g_crashOnError;
 	}
 #endif

+// Scans a test file for a `buggify=on|off` line and reports the override it requests.
+//
+// testFile: path of the test file to scan. Each line has its whitespace removed and is then
+//           read as `attrib=value`. Empty lines, lines starting with ';', and lines without
+//           an '=' are ignored.
+// Returns:  true / false for the first `buggify=on` / `buggify=off` line found; an empty
+//           Optional when the file contains no buggify line or cannot be opened, meaning
+//           "no override requested".
+// An unrecognized buggify value prints an error to stderr and calls flushAndExit(FDB_EXIT_ERROR).
+Optional<bool> checkBuggifyOverride(const char *testFile) {
+	// The stream closes itself when it goes out of scope, on every return path.
+	std::ifstream ifs(testFile, std::ifstream::in);
+	if (!ifs.good()) {
+		// An unreadable file carries no override. Returning 0 here would convert to a *present*
+		// Optional holding false and make the caller force buggify off.
+		return Optional<bool>();
+	}
+
+	std::string cline;
+	while (getline(ifs, cline)) {
+		const std::string line = removeWhitespace(cline);
+		if (line.empty() || line[0] == ';')
+			continue;
+
+		const size_t found = line.find('=');
+		if (found == std::string::npos)
+			// Not an `attrib=value` line; nothing to interpret here.
+			continue;
+
+		const std::string attrib = removeWhitespace(line.substr(0, found));
+		if (attrib != "buggify")
+			continue;
+
+		const std::string value = removeWhitespace(line.substr(found + 1));
+		if (value == "on")
+			return true;
+		if (value == "off")
+			return false;
+
+		fprintf(stderr, "ERROR: Unknown buggify override state `%s'\n", value.c_str());
+		flushAndExit(FDB_EXIT_ERROR);
+	}
+
+	return Optional<bool>();
+}
+
 int main(int argc, char* argv[]) {
 	try {
 		platformInit();
@ -1359,6 +1398,11 @@ int main(int argc, char* argv[]) {
 		else
 			g_debug_random = new DeterministicRandom(platform::getRandomSeed());

+		if(role==Simulation) {
+			Optional<bool> buggifyOverride = checkBuggifyOverride(testFile);
+			if(buggifyOverride.present())
+				buggifyEnabled = buggifyOverride.get();
+		}
 		enableBuggify( buggifyEnabled );

 		delete FLOW_KNOBS;
@ -1528,6 +1572,17 @@ int main(int argc, char* argv[]) {

 		Future<Optional<Void>> f;

+		Standalone<StringRef> machineId(getSharedMemoryMachineId().toString());
+
+		if (!localities.isPresent(LocalityData::keyZoneId))
+			localities.set(LocalityData::keyZoneId, zoneId.present() ? zoneId : machineId);
+
+		if (!localities.isPresent(LocalityData::keyMachineId))
+			localities.set(LocalityData::keyMachineId, machineId);
+
+		if (!localities.isPresent(LocalityData::keyDcId) && dcId.present())
+			localities.set(LocalityData::keyDcId, dcId);
+
 		if (role == Simulation) {
 			TraceEvent("Simulation").detail("TestFile", testFile);

@ -1574,16 +1629,6 @@ int main(int argc, char* argv[]) {

 			vector<Future<Void>> actors;
 			actors.push_back( listenError );
-			Standalone<StringRef> machineId(getSharedMemoryMachineId().toString());
-
-			if (!localities.isPresent(LocalityData::keyZoneId))
-				localities.set(LocalityData::keyZoneId, zoneId.present() ? zoneId : machineId);
-
-			if (!localities.isPresent(LocalityData::keyMachineId))
-				localities.set(LocalityData::keyMachineId, machineId);
-
-			if (!localities.isPresent(LocalityData::keyDcId) && dcId.present())
-				localities.set(LocalityData::keyDcId, dcId);

 			actors.push_back( fdbd(connectionFile, localities, processClass, dataFolder, dataFolder, storageMemLimit, metricsConnFile, metricsPrefix) );
 			//actors.push_back( recurring( []{}, .001 ) );  // for ASIO latency measurement
@ -1591,11 +1636,11 @@ int main(int argc, char* argv[]) {
 			f = stopAfter( waitForAll(actors) );
 			g_network->run();
 		} else if (role == MultiTester) {
-			f = stopAfter( runTests( connectionFile, TEST_TYPE_FROM_FILE, testOnServers ? TEST_ON_SERVERS : TEST_ON_TESTERS, minTesterCount, testFile ) );
+			f = stopAfter( runTests( connectionFile, TEST_TYPE_FROM_FILE, testOnServers ? TEST_ON_SERVERS : TEST_ON_TESTERS, minTesterCount, testFile, StringRef(), localities ) );
 			g_network->run();
 		} else if (role == Test || role == ConsistencyCheck) {
 			auto m = startSystemMonitor(dataFolder, zoneId, zoneId);
-			f = stopAfter( runTests( connectionFile, role == ConsistencyCheck ? TEST_TYPE_CONSISTENCY_CHECK : TEST_TYPE_FROM_FILE, TEST_HERE, 1, testFile ) );
+			f = stopAfter( runTests( connectionFile, role == ConsistencyCheck ? TEST_TYPE_CONSISTENCY_CHECK : TEST_TYPE_FROM_FILE, TEST_HERE, 1, testFile, StringRef(), localities ) );
 			g_network->run();
 		} else if (role == CreateTemplateDatabase) {
 			createTemplateDatabase();
--- a/fdbserver/fdbserver.vcxproj
+++ b/fdbserver/fdbserver.vcxproj
@ -48,7 +48,6 @@
    <ClCompile Include="Knobs.cpp" />
    <ActorCompiler Include="QuietDatabase.actor.cpp" />
    <ActorCompiler Include="networktest.actor.cpp" />
-    <ActorCompiler Include="MetricLogger.actor.cpp" />
    <ActorCompiler Include="workloads\SaveAndKill.actor.cpp" />
    <ActorCompiler Include="Resolver.actor.cpp" />
    <ActorCompiler Include="LogSystemDiskQueueAdapter.actor.cpp" />
@ -120,6 +119,7 @@
    <ActorCompiler Include="workloads\Increment.actor.cpp" />
    <ActorCompiler Include="workloads\FuzzApiCorrectness.actor.cpp" />
    <ActorCompiler Include="workloads\LockDatabase.actor.cpp" />
+    <ActorCompiler Include="workloads\LowLatency.actor.cpp" />
    <ClCompile Include="workloads\MemoryKeyValueStore.cpp" />
    <ActorCompiler Include="workloads\RyowCorrectness.actor.cpp" />
    <ActorCompiler Include="workloads\IndexScan.actor.cpp" />
--- a/fdbserver/fdbserver.vcxproj.filters
+++ b/fdbserver/fdbserver.vcxproj.filters
@ -247,6 +247,9 @@
      <Filter>workloads</Filter>
    </ActorCompiler>
    <ActorCompiler Include="OldTLogServer.actor.cpp" />
+    <ActorCompiler Include="workloads\LowLatency.actor.cpp">
+      <Filter>workloads</Filter>
+    </ActorCompiler>
  </ItemGroup>
  <ItemGroup>
    <ClCompile Include="SkipList.cpp" />
--- a/fdbserver/masterserver.actor.cpp
+++ b/fdbserver/masterserver.actor.cpp
@ -77,6 +77,7 @@ struct MasterData : NonCopyable, ReferenceCounted<MasterData> {

 	DatabaseConfiguration originalConfiguration;
 	DatabaseConfiguration configuration;
+	bool hasConfiguration;

 	ServerCoordinators coordinators;

@ -138,7 +139,8 @@ struct MasterData : NonCopyable, ReferenceCounted<MasterData> {
 		  version(invalidVersion),
 		  lastVersionTime(0),
 		  txnStateStore(0),
-		  memoryLimit(2e9)
+		  memoryLimit(2e9),
+		  hasConfiguration(false)
 	{
 	}
 	~MasterData() { if(txnStateStore) txnStateStore->close(); }
@ -250,53 +252,34 @@ ACTOR Future<Void> newTLogServers( Reference<MasterData> self, Future< RecruitFr
 	return Void();
 }

-ACTOR Future<Void> newSeedServers( Reference<MasterData> self, vector<StorageServerInterface>* servers ) {
+ACTOR Future<Void> newSeedServers( Reference<MasterData> self, RecruitFromConfigurationReply recruits, vector<StorageServerInterface>* servers ) {
 	// This is only necessary if the database is at version 0
 	servers->clear();
 	if (self->lastEpochEnd) return Void();

 	state Tag tag = 0;
-	state std::set<Optional<Standalone<StringRef>>> dataCenters;
-	while( servers->size() < self->configuration.storageTeamSize ) {
-		try {
-			RecruitStorageRequest req;
-			req.criticalRecruitment = true;
-			for(auto s = servers->begin(); s != servers->end(); ++s)
-				req.excludeMachines.push_back(s->locality.zoneId());
+	while( tag < recruits.storageServers.size() ) {
+		TraceEvent("MasterRecruitingInitialStorageServer", self->dbgid)
+			.detail("CandidateWorker", recruits.storageServers[tag].locality.toString());

-			TraceEvent("MasterRecruitingInitialStorageServer", self->dbgid)
-				.detail("ExcludingMachines", req.excludeMachines.size())
-				.detail("ExcludingDataCenters", req.excludeDCs.size());
+		InitializeStorageRequest isr;
+		isr.seedTag = tag;
+		isr.storeType = self->configuration.storageServerStoreType;
+		isr.reqId = g_random->randomUniqueID();
+		isr.interfaceId = g_random->randomUniqueID();

-			RecruitStorageReply candidateWorker = wait( brokenPromiseToNever( self->clusterController.recruitStorage.getReply( req ) ) );
+		ErrorOr<StorageServerInterface> newServer = wait( recruits.storageServers[tag].storage.tryGetReply( isr ) );

-			TraceEvent("MasterRecruitingInitialStorageServer", self->dbgid)
-				.detail("CandidateWorker", candidateWorker.worker.locality.toString());
+		if( newServer.isError() ) {
+			if( !newServer.isError( error_code_recruitment_failed ) && !newServer.isError( error_code_request_maybe_delivered ) )
+				throw newServer.getError();

-			InitializeStorageRequest isr;
-			isr.seedTag = tag;
-			isr.storeType = self->configuration.storageServerStoreType;
-			isr.reqId = g_random->randomUniqueID();
-			isr.interfaceId = g_random->randomUniqueID();
-
-			ErrorOr<StorageServerInterface> newServer = wait( candidateWorker.worker.storage.tryGetReply( isr ) );
-
-			if( newServer.isError() ) {
-				if( !newServer.isError( error_code_recruitment_failed ) && !newServer.isError( error_code_request_maybe_delivered ) )
-					throw newServer.getError();
-
-				TEST( true ); // masterserver initial storage recuitment loop failed to get new server
-				Void _ = wait( delay(SERVER_KNOBS->STORAGE_RECRUITMENT_DELAY) );
-			}
-			else {
-				servers->push_back( newServer.get() );
-				dataCenters.insert( newServer.get().locality.dcId() );
-				tag++;
-			}
-		} catch ( Error &e ) {
-			if(e.code() != error_code_timed_out) {
-				throw;
-			}
+			TEST( true ); // masterserver initial storage recuitment loop failed to get new server
+			Void _ = wait( delay(SERVER_KNOBS->STORAGE_RECRUITMENT_DELAY) );
+		}
+		else {
+			servers->push_back( newServer.get() );
+			tag++;
 		}
 	}

@ -376,7 +359,7 @@ Future<Void> sendMasterRegistration( MasterData* self, LogSystemConfig const& lo
 	masterReq.proxies = proxies;
 	masterReq.resolvers = resolvers;
 	masterReq.recoveryCount = recoveryCount;
-	masterReq.configuration = self->configuration;
+	if(self->hasConfiguration) masterReq.configuration = self->configuration;
 	masterReq.registrationCount = ++self->registrationCount;
 	masterReq.priorCommittedLogServers = priorCommittedLogServers;
 	masterReq.recoveryState = self->recoveryState;
@ -450,7 +433,7 @@ ACTOR Future<Standalone<CommitTransactionRef>> provisionalMaster( Reference<Mast
 				}
 			}
 		}
-		when ( ReplyPromise<vector<StorageServerInterface>> req = waitNext( parent->provisionalProxies[0].getKeyServersLocations.getFuture() ) ) {
+		when ( ReplyPromise<vector<pair<KeyRangeRef, vector<StorageServerInterface>>>> req = waitNext( parent->provisionalProxies[0].getKeyServersLocations.getFuture() ) ) {
 			req.send(Never());
 		}
 		when ( Void _ = wait( waitFailure ) ) { throw worker_removed(); }
@ -486,7 +469,7 @@ ACTOR Future<Void> recruitEverything( Reference<MasterData> self, vector<Storage

 	RecruitFromConfigurationReply recruits = wait(
 		brokenPromiseToNever( self->clusterController.recruitFromConfiguration.getReply(
-			RecruitFromConfigurationRequest( self->configuration ) ) ) );
+			RecruitFromConfigurationRequest( self->configuration, self->lastEpochEnd==0 ) ) ) );

 	TraceEvent("MasterRecoveryState", self->dbgid)
 		.detail("StatusCode", RecoveryStatus::initializing_transaction_servers)
@ -499,7 +482,7 @@ ACTOR Future<Void> recruitEverything( Reference<MasterData> self, vector<Storage
 	// Actually, newSeedServers does both the recruiting and initialization of the seed servers; so if this is a brand new database we are sort of lying that we are
 	// past the recruitment phase.  In a perfect world we would split that up so that the recruitment part happens above (in parallel with recruiting the transaction servers?).

-	Void _ = wait( newProxies( self, recruits ) && newResolvers( self, recruits ) && newTLogServers( self, recruits, oldLogSystem ) && newSeedServers( self, seedServers ) );
+	Void _ = wait( newProxies( self, recruits ) && newResolvers( self, recruits ) && newTLogServers( self, recruits, oldLogSystem ) && newSeedServers( self, recruits, seedServers ) );
 	return Void();
 }

@ -559,6 +542,7 @@ ACTOR Future<Void> readTransactionSystemState( Reference<MasterData> self, Refer
 	Standalone<VectorRef<KeyValueRef>> rawConf = wait( self->txnStateStore->readRange( configKeys ) );
 	self->configuration.fromKeyValues( rawConf );
 	self->originalConfiguration = self->configuration;
+	self->hasConfiguration = true;
 	TraceEvent("MasterRecoveredConfig", self->dbgid).detail("conf", self->configuration.toString()).trackLatest("RecoveredConfig");

 	//auto kvs = self->txnStateStore->readRange( systemKeys );
@ -646,6 +630,7 @@ ACTOR Future<Void> recoverFrom( Reference<MasterData> self, Reference<ILogSystem
 		.detail("StatusCode", RecoveryStatus::reading_transaction_system_state)
 		.detail("Status", RecoveryStatus::names[RecoveryStatus::reading_transaction_system_state])
 		.trackLatest(format("%s/MasterRecoveryState", printable(self->dbName).c_str() ).c_str());
+	self->hasConfiguration = false;

 	if(BUGGIFY)
 		Void _ = wait( delay(10.0) );
--- a/fdbserver/storageserver.actor.cpp
+++ b/fdbserver/storageserver.actor.cpp
@ -3224,7 +3224,7 @@ ACTOR Future<Void> replaceInterface( StorageServer* self, StorageServerInterface
 	return Void();
 }

-ACTOR Future<Void> storageServer( IKeyValueStore* persistentData, StorageServerInterface ssi, Reference<AsyncVar<ServerDBInfo>> db, std::string folder )
+ACTOR Future<Void> storageServer( IKeyValueStore* persistentData, StorageServerInterface ssi, Reference<AsyncVar<ServerDBInfo>> db, std::string folder, Promise<Void> recovered )
 {
 	state StorageServer self(persistentData, db, ssi);
 	self.folder = folder;
@ -3233,23 +3233,28 @@ ACTOR Future<Void> storageServer( IKeyValueStore* persistentData, StorageServerI
 		state double start = now();
 		TraceEvent("StorageServerRebootStart", self.thisServerID);
 		bool ok = wait( self.storage.restoreDurableState() );
-		if (!ok) return Void();
+		if (!ok) {
+			if(recovered.canBeSet()) recovered.send(Void());
+			return Void();
+		}
 		TraceEvent("SSTimeRestoreDurableState", self.thisServerID).detail("TimeTaken", now() - start);

 		ASSERT( self.thisServerID == ssi.id() );
 		TraceEvent("StorageServerReboot", self.thisServerID)
 			.detail("Version", self.version.get());

+		if(recovered.canBeSet()) recovered.send(Void());
+
 		Void _ = wait( replaceInterface( &self, ssi ) );

 		TraceEvent("StorageServerStartingCore", self.thisServerID).detail("TimeTaken", now() - start);

 		//Void _ = wait( delay(0) );  // To make sure self->zkMasterInfo.onChanged is available to wait on
-
 		Void _ = wait( storageServerCore(&self, ssi) );

 		throw internal_error();
 	} catch (Error& e) {
+		if(recovered.canBeSet()) recovered.send(Void());
 		if (storageServerTerminated(self, persistentData, e))
 			return Void();
 		throw e;
--- a/fdbserver/tester.actor.cpp
+++ b/fdbserver/tester.actor.cpp
@ -484,7 +484,7 @@ ACTOR Future<Void> runWorkloadAsync( Database cx, WorkloadInterface workIface, T
 	return Void();
 }

-ACTOR Future<Void> testerServerWorkload( WorkloadRequest work, Reference<ClusterConnectionFile> ccf, Reference<AsyncVar<struct ServerDBInfo>> dbInfo ) {
+ACTOR Future<Void> testerServerWorkload( WorkloadRequest work, Reference<ClusterConnectionFile> ccf, Reference<AsyncVar<struct ServerDBInfo>> dbInfo, LocalityData locality ) {
 	state WorkloadInterface workIface;
 	state bool replied = false;
 	state Database cx;
@ -501,7 +501,7 @@ ACTOR Future<Void> testerServerWorkload( WorkloadRequest work, Reference<Cluster

 		if( database.size() ) {
 			Reference<Cluster> cluster = Cluster::createCluster(ccf->getFilename(), -1);
-			Database _cx = wait(cluster->createDatabase(database));
+			Database _cx = wait(cluster->createDatabase(database, locality));
 			cx = _cx;

 			Void _ = wait( delay(1.0) );
@ -544,7 +544,7 @@ ACTOR Future<Void> testerServerWorkload( WorkloadRequest work, Reference<Cluster
 	return Void();
 }

-ACTOR Future<Void> testerServerCore( TesterInterface interf, Reference<ClusterConnectionFile> ccf, Reference<AsyncVar<struct ServerDBInfo>> dbInfo ) {
+ACTOR Future<Void> testerServerCore( TesterInterface interf, Reference<ClusterConnectionFile> ccf, Reference<AsyncVar<struct ServerDBInfo>> dbInfo, LocalityData locality ) {
 	state PromiseStream<Future<Void>> addWorkload;
 	state Future<Void> workerFatalError = actorCollection(addWorkload.getFuture());

@ -552,7 +552,7 @@ ACTOR Future<Void> testerServerCore( TesterInterface interf, Reference<ClusterCo
 	loop choose {
 		when (Void _ = wait(workerFatalError)) {}
 		when (WorkloadRequest work = waitNext( interf.recruitments.getFuture() )) {
-			addWorkload.send(testerServerWorkload(work, ccf, dbInfo));
+			addWorkload.send(testerServerWorkload(work, ccf, dbInfo, locality));
 		}
 	}
 }
@ -740,10 +740,10 @@ ACTOR Future<Void> checkConsistency(Database cx, std::vector< TesterInterface >
 									double quiescentWaitTimeout, double softTimeLimit, double databasePingDelay) {
 	state TestSpec spec;

-	state bool connectionFailures;
+	state double connectionFailures;
 	if( g_network->isSimulated() ) {
-		connectionFailures = g_simulator.enableConnectionFailures;
-		g_simulator.enableConnectionFailures = false;
+		connectionFailures = g_simulator.connectionFailuresDisableDuration;
+		g_simulator.connectionFailuresDisableDuration = 1e6;
 		g_simulator.speedUpSimulation = true;
 	}
 	
@ -763,7 +763,7 @@ ACTOR Future<Void> checkConsistency(Database cx, std::vector< TesterInterface >
 		DistributedTestResults testResults = wait(runWorkload(cx, testers, database, spec));
 		if(testResults.ok() || lastRun) {
 			if( g_network->isSimulated() ) {
-				g_simulator.enableConnectionFailures = connectionFailures;
+				g_simulator.connectionFailuresDisableDuration = connectionFailures;
 			}
 			return Void();
 		}
@ -934,11 +934,14 @@ vector<TestSpec> readTests( ifstream& ifs ) {
 		} else if( attrib == "simCheckRelocationDuration" ) {
 			spec.simCheckRelocationDuration = (value == "true");
 			TraceEvent("TestParserTest").detail("ParsedSimCheckRelocationDuration", spec.simCheckRelocationDuration);
-		} else if( attrib == "simEnableConnectionFailures" ) {
-			spec.simEnableConnectionFailures = (value == "true");
-			if(g_network->isSimulated() && !spec.simEnableConnectionFailures)
-				g_simulator.enableConnectionFailures = false;
-			TraceEvent("TestParserTest").detail("ParsedSimEnableConnectionFailures", spec.simEnableConnectionFailures);
+		} else if( attrib == "connectionFailuresDisableDuration" ) {
+			double connectionFailuresDisableDuration;
+			sscanf( value.c_str(), "%lf", &connectionFailuresDisableDuration );
+			ASSERT( connectionFailuresDisableDuration >= 0 );
+			spec.simConnectionFailuresDisableDuration = connectionFailuresDisableDuration;
+			if(g_network->isSimulated())
+				g_simulator.connectionFailuresDisableDuration = spec.simConnectionFailuresDisableDuration;
+			TraceEvent("TestParserTest").detail("ParsedSimConnectionFailuresDisableDuration", spec.simConnectionFailuresDisableDuration);
 		} else if( attrib == "simBackupAgents" ) {
 			if (value == "BackupToFile")
 				spec.simBackupAgents = ISimulator::BackupToFile;
@ -949,6 +952,10 @@ vector<TestSpec> readTests( ifstream& ifs ) {
 			TraceEvent("TestParserTest").detail("ParsedSimBackupAgents", spec.simBackupAgents);
 		} else if( attrib == "extraDB" ) {
 			TraceEvent("TestParserTest").detail("ParsedExtraDB", "");
+		} else if( attrib == "minimumReplication" ) {
+			TraceEvent("TestParserTest").detail("ParsedMinimumReplication", "");
+		} else if( attrib == "buggify" ) {
+			TraceEvent("TestParserTest").detail("ParsedBuggify", "");
 		} else if( attrib == "checkOnly" ) {
 			if(value == "true")
 				spec.phases = TestWorkload::CHECK;
@ -985,7 +992,7 @@ vector<TestSpec> readTests( ifstream& ifs ) {
 	return result;
 }

-ACTOR Future<Void> runTests( Reference<AsyncVar<Optional<struct ClusterControllerFullInterface>>> cc, Reference<AsyncVar<Optional<struct ClusterInterface>>> ci, vector< TesterInterface > testers, vector<TestSpec> tests, StringRef startingConfiguration ) {
+ACTOR Future<Void> runTests( Reference<AsyncVar<Optional<struct ClusterControllerFullInterface>>> cc, Reference<AsyncVar<Optional<struct ClusterInterface>>> ci, vector< TesterInterface > testers, vector<TestSpec> tests, StringRef startingConfiguration, LocalityData locality ) {
 	state Standalone<StringRef> database = LiteralStringRef("DB");
 	state Database cx;
 	state Reference<AsyncVar<ServerDBInfo>> dbInfo( new AsyncVar<ServerDBInfo> );
@ -1016,7 +1023,7 @@ ACTOR Future<Void> runTests( Reference<AsyncVar<Optional<struct ClusterControlle
 		databasePingDelay = 0.0;
 	
 	if (useDB) {
-		Database _cx = wait( DatabaseContext::createDatabase( ci, Reference<Cluster>(), database, LocalityData() ) ); // FIXME: Locality!
+		Database _cx = wait( DatabaseContext::createDatabase( ci, Reference<Cluster>(), database, locality ) );
 		cx = _cx;
 	} else
 		database = LiteralStringRef("");
@ -1071,7 +1078,7 @@ ACTOR Future<Void> runTests( Reference<AsyncVar<Optional<struct ClusterControlle

 ACTOR Future<Void> runTests( Reference<AsyncVar<Optional<struct ClusterControllerFullInterface>>> cc, 
 		Reference<AsyncVar<Optional<struct ClusterInterface>>> ci, vector<TestSpec> tests, test_location_t at, 
-		int minTestersExpected, StringRef startingConfiguration ) {
+		int minTestersExpected, StringRef startingConfiguration, LocalityData locality ) {
 	state int flags = at == TEST_ON_SERVERS ? 0 : GetWorkersRequest::FLAG_TESTER_CLASS;
 	state Future<Void> testerTimeout = delay(60.0); // wait 60 sec for testers to show up
 	state vector<std::pair<WorkerInterface, ProcessClass>> workers;
@ -1097,12 +1104,12 @@ ACTOR Future<Void> runTests( Reference<AsyncVar<Optional<struct ClusterControlle
 	for(int i=0; i<workers.size(); i++)
 		ts.push_back(workers[i].first.testerInterface);

-	Void _ = wait( runTests( cc, ci, ts, tests, startingConfiguration) );
+	Void _ = wait( runTests( cc, ci, ts, tests, startingConfiguration, locality) );
 	return Void();
 }

 ACTOR Future<Void> runTests( Reference<ClusterConnectionFile> connFile, test_type_t whatToRun, test_location_t at, 
-		int minTestersExpected, std::string fileName, StringRef startingConfiguration ) {
+		int minTestersExpected, std::string fileName, StringRef startingConfiguration, LocalityData locality ) {
 	state vector<TestSpec> testSpecs;
 	Reference<AsyncVar<Optional<ClusterControllerFullInterface>>> cc( new AsyncVar<Optional<ClusterControllerFullInterface>> );
 	Reference<AsyncVar<Optional<ClusterInterface>>> ci( new AsyncVar<Optional<ClusterInterface>> );
@ -1147,10 +1154,10 @@ ACTOR Future<Void> runTests( Reference<ClusterConnectionFile> connFile, test_typ
 		Reference<AsyncVar<ServerDBInfo>> db( new AsyncVar<ServerDBInfo> );
 		vector<TesterInterface> iTesters(1);
 		actors.push_back( reportErrors(monitorServerDBInfo( cc, Reference<ClusterConnectionFile>(), LocalityData(), db ), "monitorServerDBInfo") );  // FIXME: Locality
-		actors.push_back( reportErrors(testerServerCore( iTesters[0], connFile, db ), "testerServerCore") );
-		tests = runTests( cc, ci, iTesters, testSpecs, startingConfiguration );
+		actors.push_back( reportErrors(testerServerCore( iTesters[0], connFile, db, locality ), "testerServerCore") );
+		tests = runTests( cc, ci, iTesters, testSpecs, startingConfiguration, locality );
 	} else {
-		tests = reportErrors(runTests(cc, ci, testSpecs, at, minTestersExpected, startingConfiguration), "runTests");
+		tests = reportErrors(runTests(cc, ci, testSpecs, at, minTestersExpected, startingConfiguration, locality), "runTests");
 	}

 	choose {
--- a/fdbserver/worker.actor.cpp
+++ b/fdbserver/worker.actor.cpp
@ -24,6 +24,7 @@
 #include "flow/TDMetric.actor.h"
 #include "fdbrpc/simulator.h"
 #include "fdbclient/NativeAPI.h"
+#include "fdbclient/MetricLogger.h"
 #include "WorkerInterface.h"
 #include "IKeyValueStore.h"
 #include "WaitFailure.h"
@ -184,7 +185,7 @@ std::string filenameFromSample( KeyValueStoreType storeType, std::string folder,
 	if( storeType == KeyValueStoreType::SSD_BTREE_V1 )
 		return joinPath( folder, sample_filename );
 	else if ( storeType == KeyValueStoreType::SSD_BTREE_V2 )
-		return joinPath(folder, sample_filename); 
+		return joinPath(folder, sample_filename);
 	else if( storeType == KeyValueStoreType::MEMORY )
 		return joinPath( folder, sample_filename.substr(0, sample_filename.size() - 5) );

@ -195,7 +196,7 @@ std::string filenameFromId( KeyValueStoreType storeType, std::string folder, std
 	if( storeType == KeyValueStoreType::SSD_BTREE_V1)
 		return joinPath( folder, prefix + id.toString() + ".fdb" );
 	else if (storeType == KeyValueStoreType::SSD_BTREE_V2)
-		return joinPath(folder, prefix + id.toString() + ".sqlite"); 
+		return joinPath(folder, prefix + id.toString() + ".sqlite");
 	else if( storeType == KeyValueStoreType::MEMORY )
 		return joinPath( folder, prefix + id.toString() + "-" );

@ -318,7 +319,7 @@ ACTOR Future<Void> storageServerRollbackRebooter( Future<Void> prevStorageServer
 		auto* kv = openKVStore( storeType, filename, ssi.uniqueID, memoryLimit );
 		Future<Void> kvClosed = kv->onClosed();
 		filesClosed->add( kvClosed );
-		prevStorageServer = storageServer( kv, ssi, db, folder );
+		prevStorageServer = storageServer( kv, ssi, db, folder, Promise<Void>() );
 		prevStorageServer = handleIOErrors( prevStorageServer, kv, ssi.id(), kvClosed );
 	}
 }
@ -355,6 +356,7 @@ void startRole(UID roleId, UID workerId, std::string as, std::map<std::string, s
 	g_roles.insert({as, roleId.shortString()});
 	StringMetricHandle(LiteralStringRef("Roles")) = roleString(g_roles, false);
 	StringMetricHandle(LiteralStringRef("RolesWithIDs")) = roleString(g_roles, true);
+	if (g_network->isSimulated()) g_simulator.addRole(g_network->getLocalAddress(), as);
 }

 void endRole(UID id, std::string as, std::string reason, bool ok, Error e) {
@ -386,6 +388,7 @@ void endRole(UID id, std::string as, std::string reason, bool ok, Error e) {
 	g_roles.erase({as, id.shortString()});
 	StringMetricHandle(LiteralStringRef("Roles")) = roleString(g_roles, false);
 	StringMetricHandle(LiteralStringRef("RolesWithIDs")) = roleString(g_roles, true);
+	if (g_network->isSimulated()) g_simulator.removeRole(g_network->getLocalAddress(), as);
 }

 ACTOR Future<Void> monitorServerDBInfo( Reference<AsyncVar<Optional<ClusterControllerFullInterface>>> ccInterface, Reference<ClusterConnectionFile> connFile, LocalityData locality, Reference<AsyncVar<ServerDBInfo>> dbInfo ) {
@ -509,7 +512,7 @@ ACTOR Future<Void> workerServer( Reference<ClusterConnectionFile> connFile, Refe
 		if( metricsConnFile.size() > 0) {
 			try {
 				state Reference<Cluster> cluster = Cluster::createCluster( metricsConnFile, Cluster::API_VERSION_LATEST );
-				metricsLogger = runMetrics( cluster->createDatabase(LiteralStringRef("DB")), KeyRef(metricsPrefix) );
+				metricsLogger = runMetrics( cluster->createDatabase(LiteralStringRef("DB"), locality), KeyRef(metricsPrefix) );
 			} catch(Error &e) {
 				TraceEvent(SevWarnAlways, "TDMetricsBadClusterFile").error(e).detail("ConnFile", metricsConnFile);
 			}
@ -520,10 +523,9 @@ ACTOR Future<Void> workerServer( Reference<ClusterConnectionFile> connFile, Refe
 	}

 	errorForwarders.add( loadedPonger( interf.debugPing.getFuture() ) );
-	errorForwarders.add( registrationClient( ccInterface, interf, processClass ) );
 	errorForwarders.add( waitFailureServer( interf.waitFailure.getFuture() ) );
 	errorForwarders.add( monitorServerDBInfo( ccInterface, connFile, locality, dbInfo ) );
-	errorForwarders.add( testerServerCore( interf.testerInterface, connFile, dbInfo ) );
+	errorForwarders.add( testerServerCore( interf.testerInterface, connFile, dbInfo, locality ) );

 	filesClosed.add(stopping.getFuture());

@ -550,7 +552,7 @@ ACTOR Future<Void> workerServer( Reference<ClusterConnectionFile> connFile, Refe
 	try {
 		std::vector<DiskStore> stores = getDiskStores( folder );
 		bool validateDataFiles = deleteFile(joinPath(folder, validationFilename));
-
+		std::vector<Future<Void>> recoveries;
 		for( int f = 0; f < stores.size(); f++ ) {
 			DiskStore s = stores[f];
 			// FIXME: Error handling
@ -581,7 +583,9 @@ ACTOR Future<Void> workerServer( Reference<ClusterConnectionFile> connFile, Refe
 				DUMPTOKEN(recruited.getKeyValueStoreType);
 				DUMPTOKEN(recruited.watchValue);

-				Future<Void> f = storageServer( kv, recruited, dbInfo, folder );
+				Promise<Void> recovery;
+				Future<Void> f = storageServer( kv, recruited, dbInfo, folder, recovery );
+				recoveries.push_back(recovery.getFuture());
 				f =  handleIOErrors( f, kv, s.storeID, kvClosed );
 				f = storageServerRollbackRebooter( f, s.storeType, s.filename, recruited, dbInfo, folder, &filesClosed, memoryLimit );
 				errorForwarders.add( forwardError( errors, "StorageServer", recruited.id(), f ) );
@ -597,7 +601,9 @@ ACTOR Future<Void> workerServer( Reference<ClusterConnectionFile> connFile, Refe
 				startRole( s.storeID, interf.id(), "SharedTLog", details, "Restored" );

 				Promise<Void> oldLog;
-				Future<Void> tl = tLog( kv, queue, dbInfo, locality, tlog.isReady() ? tlogRequests : PromiseStream<InitializeTLogRequest>(), s.storeID, true, oldLog );
+				Promise<Void> recovery;
+				Future<Void> tl = tLog( kv, queue, dbInfo, locality, tlog.isReady() ? tlogRequests : PromiseStream<InitializeTLogRequest>(), s.storeID, true, oldLog, recovery );
+				recoveries.push_back(recovery.getFuture());
 				tl = handleIOErrors( tl, kv, s.storeID );
 				tl = handleIOErrors( tl, queue, s.storeID );
 				if(tlog.isReady()) {
@ -613,6 +619,11 @@ ACTOR Future<Void> workerServer( Reference<ClusterConnectionFile> connFile, Refe
 		details["StoresPresent"] = format("%d", stores.size());
 		startRole( interf.id(), interf.id(), "Worker", details );

+		Void _ = wait(waitForAll(recoveries));
+		errorForwarders.add( registrationClient( ccInterface, interf, processClass ) );
+
+		TraceEvent("RecoveriesComplete", interf.id());
+
 		loop choose {

 			when( RebootRequest req = waitNext( interf.clientInterface.reboot.getFuture() ) ) {
@ -621,7 +632,7 @@ ACTOR Future<Void> workerServer( Reference<ClusterConnectionFile> connFile, Refe
 					Reference<IAsyncFile> checkFile = wait( IAsyncFileSystem::filesystem()->open( joinPath(folder, validationFilename), IAsyncFile::OPEN_CREATE | IAsyncFile::OPEN_READWRITE, 0600 ) );
 					Void _ = wait( checkFile->sync() );
 				}
-			
+
 				if(g_network->isSimulated()) {
 					TraceEvent("SimulatedReboot").detail("Deletion", rebootReq.deleteData );
 					if( rebootReq.deleteData ) {
@ -660,7 +671,7 @@ ACTOR Future<Void> workerServer( Reference<ClusterConnectionFile> connFile, Refe
 					std::map<std::string, std::string> details;
 					details["ForMaster"] = req.recruitmentID.shortString();
 					details["StorageEngine"] = req.storeType.toString();
-					
+
 					//FIXME: start role for every tlog instance, rather that just for the shared actor, also use a different role type for the shared actor
 					startRole( logId, interf.id(), "SharedTLog", details );

@ -669,7 +680,7 @@ ACTOR Future<Void> workerServer( Reference<ClusterConnectionFile> connFile, Refe
 					IDiskQueue* queue = openDiskQueue( joinPath( folder, fileLogQueuePrefix.toString() + logId.toString() + "-" ), logId );
 					filesClosed.add( data->onClosed() );
 					filesClosed.add( queue->onClosed() );
-					tlog = tLog( data, queue, dbInfo, locality, tlogRequests, logId, false, Promise<Void>() );
+					tlog = tLog( data, queue, dbInfo, locality, tlogRequests, logId, false, Promise<Void>(), Promise<Void>() );
 					tlog = handleIOErrors( tlog, data, logId );
 					tlog = handleIOErrors( tlog, queue, logId );
 					errorForwarders.add( forwardError( errors, "SharedTLog", logId, tlog ) );
@ -906,10 +917,11 @@ ACTOR Future<Void> fdbd(
 		// SOMEDAY: start the services on the machine in a staggered fashion in simulation?
 		Reference<AsyncVar<Optional<ClusterControllerFullInterface>>> cc( new AsyncVar<Optional<ClusterControllerFullInterface>> );
 		Reference<AsyncVar<Optional<ClusterInterface>>> ci( new AsyncVar<Optional<ClusterInterface>> );
+		Reference<AsyncVar<ProcessClass>> asyncProcessClass(new AsyncVar<ProcessClass>(ProcessClass(processClass.classType(), ProcessClass::CommandLineSource)));
 		vector<Future<Void>> v;
 		if ( coordFolder.size() )
 			v.push_back( fileNotFoundToNever( coordinationServer( coordFolder ) ) ); //SOMEDAY: remove the fileNotFound wrapper and make DiskQueue construction safe from errors setting up their files
-		v.push_back( reportErrors( processClass == ProcessClass::TesterClass ? monitorLeader( connFile, cc ) : clusterController( connFile, cc ), "clusterController") );
+		v.push_back( reportErrors( processClass == ProcessClass::TesterClass ? monitorLeader( connFile, cc ) : clusterController( connFile, cc , asyncProcessClass), "clusterController") );
 		v.push_back( reportErrors(extractClusterInterface( cc, ci ), "extractClusterInterface") );
 		v.push_back( reportErrors(failureMonitorClient( ci, true ), "failureMonitorClient") );
 		v.push_back( reportErrorsExcept(workerServer(connFile, cc, localities, processClass, dataFolder, memoryLimit, metricsConnFile, metricsPrefix), "workerServer", UID(), &normalWorkerErrors()) );
--- a/fdbserver/workloads/ApiCorrectness.actor.cpp
+++ b/fdbserver/workloads/ApiCorrectness.actor.cpp
@ -132,11 +132,12 @@ public:
 	}

 	ACTOR Future<Void> performSetup(Database cx, ApiCorrectnessWorkload *self) {
-		//Choose a random transaction type (NativeAPI, ReadYourWrites, ThreadSafe)
+		//Choose a random transaction type (NativeAPI, ReadYourWrites, ThreadSafe, MultiVersion)
 		std::vector<TransactionType> types;
 		types.push_back(NATIVE);
 		types.push_back(READ_YOUR_WRITES);
 		types.push_back(THREAD_SAFE);
+		types.push_back(MULTI_VERSION);

 		Void _ = wait(self->chooseTransactionFactory(cx, types));

--- a/fdbserver/workloads/ConsistencyCheck.actor.cpp
+++ b/fdbserver/workloads/ConsistencyCheck.actor.cpp
@ -171,6 +171,7 @@ struct ConsistencyCheckWorkload : TestWorkload
 				state DatabaseConfiguration configuration;

 				state Transaction tr(cx);
+				tr.setOption(FDBTransactionOptions::LOCK_AWARE);
 				loop {
 					try {
 						Standalone<RangeResultRef> res = wait( tr.getRange(configKeys, 1000) );
@ -247,15 +248,15 @@ struct ConsistencyCheckWorkload : TestWorkload
 				}

 				//Get a list of key servers; verify that the TLogs and master all agree about who the key servers are
-				state Promise<vector<StorageServerInterface>> keyServerPromise;
+				state Promise<vector<pair<KeyRangeRef, vector<StorageServerInterface>>>> keyServerPromise;
 				bool keyServerResult = wait(self->getKeyServers(cx, self, keyServerPromise));
 				if(keyServerResult)
 				{
-					state vector<StorageServerInterface> storageServers = keyServerPromise.getFuture().get();
+					state vector<pair<KeyRangeRef, vector<StorageServerInterface>>> keyServers = keyServerPromise.getFuture().get();

 					//Get the locations of all the shards in the database
 					state Promise<Standalone<VectorRef<KeyValueRef>>> keyLocationPromise;
-					bool keyLocationResult = wait(self->getKeyLocations(cx, storageServers, self, keyLocationPromise));
+					bool keyLocationResult = wait(self->getKeyLocations(cx, keyServers, self, keyLocationPromise));
 					if(keyLocationResult)
 					{
 						state Standalone<VectorRef<KeyValueRef>> keyLocations = keyLocationPromise.getFuture().get();
@ -268,7 +269,7 @@ struct ConsistencyCheckWorkload : TestWorkload
 			catch(Error &e)
 			{
 				if(e.code() == error_code_past_version || e.code() == error_code_future_version || e.code() == error_code_wrong_shard_server || e.code() == error_code_all_alternatives_failed || e.code() == error_code_server_request_queue_full)
-					TraceEvent("ConsistencyCheck_Retry").error(e);
+					TraceEvent("ConsistencyCheck_Retry").error(e); // FIXME: consistency check does not retry in this case
 				else
 					self->testFailure(format("Error %d - %s", e.code(), e.what()));
 			}
@ -285,6 +286,7 @@ struct ConsistencyCheckWorkload : TestWorkload
 		loop
 		{
 			state Transaction tr(cx);
+			tr.setOption(FDBTransactionOptions::LOCK_AWARE);
 			try
 			{
 				Version version = wait(tr.getReadVersion());
@ -300,18 +302,18 @@ struct ConsistencyCheckWorkload : TestWorkload
 	//Get a list of storage servers from the master and compares them with the TLogs.
 	//If this is a quiescent check, then each master proxy needs to respond, otherwise only one needs to respond.
 	//Returns false if there is a failure (in this case, keyServersPromise will never be set)
-	ACTOR Future<bool> getKeyServers(Database cx, ConsistencyCheckWorkload *self, Promise<vector<StorageServerInterface>> keyServersPromise)
+	ACTOR Future<bool> getKeyServers(Database cx, ConsistencyCheckWorkload *self, Promise<vector<pair<KeyRangeRef, vector<StorageServerInterface>>>> keyServersPromise)
 	{
-		state vector<StorageServerInterface> keyServers;
+		state vector<pair<KeyRangeRef, vector<StorageServerInterface>>> keyServers;

 		loop
 		{
 			state Reference<ProxyInfo> proxyInfo = wait(cx->getMasterProxiesFuture());

 			//Try getting key server locations from the master proxies
-			state vector<Future<ErrorOr<vector<StorageServerInterface>>>> keyServerLocationFutures;
+			state vector<Future<ErrorOr<vector<pair<KeyRangeRef, vector<StorageServerInterface>>>>>> keyServerLocationFutures;
 			for(int i = 0; i < proxyInfo->size(); i++)
-				keyServerLocationFutures.push_back(proxyInfo->get(i,&MasterProxyInterface::getKeyServersLocations).getReplyUnlessFailedFor(ReplyPromise<vector<StorageServerInterface>>(), 2, 0));
+				keyServerLocationFutures.push_back(proxyInfo->get(i,&MasterProxyInterface::getKeyServersLocations).getReplyUnlessFailedFor(ReplyPromise<vector<pair<KeyRangeRef, vector<StorageServerInterface>>>>(), 2, 0));

 			choose {
 				when( Void _ = wait(waitForAll(keyServerLocationFutures)) ) {
@ -320,21 +322,21 @@ struct ConsistencyCheckWorkload : TestWorkload
 					state bool successful = true;
 					for(int i = 0; i < keyServerLocationFutures.size(); i++)
 					{
-						ErrorOr<vector<StorageServerInterface>> interfaces = keyServerLocationFutures[i].get();
+						ErrorOr<vector<pair<KeyRangeRef, vector<StorageServerInterface>>>> shards = keyServerLocationFutures[i].get();

 						//If performing quiescent check, then all master proxies should be reachable.  Otherwise, only one needs to be reachable
-						if(self->performQuiescentChecks && !interfaces.present())
+						if(self->performQuiescentChecks && !shards.present())
 						{
 							TraceEvent("ConsistencyCheck_MasterProxyUnavailable").detail("MasterProxyID", proxyInfo->getId(i));
 							self->testFailure("Master proxy unavailable");
 							return false;
 						}

-						//Get the list of interfaces if one was returned.  If not doing a quiescent check, we can break if it is.
-						//If we are doing a quiescent check, then we only need to do this for the first interface.
-						if(interfaces.present() && (i == 0 || !self->performQuiescentChecks))
+						//Get the list of shards if one was returned.  If not doing a quiescent check, we can break if it is.
+						//If we are doing a quiescent check, then we only need to do this for the first shard.
+						if(shards.present() && (i == 0 || !self->performQuiescentChecks))
 						{
-							keyServers = interfaces.get();
+							keyServers = shards.get();
 							if(!self->performQuiescentChecks)
 								break;
 						}
@ -364,96 +366,108 @@ struct ConsistencyCheckWorkload : TestWorkload

 	//Retrieves the locations of all shards in the database
 	//Returns false if there is a failure (in this case, keyLocationPromise will never be set)
-	ACTOR Future<bool> getKeyLocations(Database cx, vector<StorageServerInterface> storageServers, ConsistencyCheckWorkload *self, Promise<Standalone<VectorRef<KeyValueRef>>> keyLocationPromise)
+	ACTOR Future<bool> getKeyLocations(Database cx, vector<pair<KeyRangeRef, vector<StorageServerInterface>>> shards, ConsistencyCheckWorkload *self, Promise<Standalone<VectorRef<KeyValueRef>>> keyLocationPromise) //shards: (key range, storage server interfaces) pairs; only entries whose range begins with keyServersPrefix are read
 	{
 		state Standalone<VectorRef<KeyValueRef>> keyLocations;
 		state Key beginKey = allKeys.begin;
+		state int i = 0; //index of the shard currently being read
 
 		//If the responses are too big, we may use multiple requests to get the key locations.  Each request begins where the last left off
-		while(beginKey < allKeys.end)
+		for ( ; i < shards.size(); i++)
 		{
-			try
+			// skip serverList shards
+			if (!shards[i].first.begin.startsWith(keyServersPrefix)) {
+				break;
+			}
+
+			state Key endKey = shards[i].first.end.startsWith(keyServersPrefix) ? shards[i].first.end.removePrefix(keyServersPrefix) : allKeys.end; //shard end with keyServersPrefix stripped; allKeys.end when the shard's end is not under that prefix
+
+			while(beginKey < endKey)
 			{
-				Version version = wait(self->getVersion(cx, self));
-
-				GetKeyValuesRequest req;
-				Key prefixBegin = beginKey.withPrefix(keyServersPrefix);
-				req.begin = firstGreaterOrEqual(prefixBegin);
-				req.end = firstGreaterOrEqual(keyServersEnd);
-				req.limit = SERVER_KNOBS->MOVE_KEYS_KRM_LIMIT;
-				req.limitBytes = SERVER_KNOBS->MOVE_KEYS_KRM_LIMIT_BYTES;
-				req.version = version;
-
-				//Try getting the shard locations from the key servers
-				state vector<Future<ErrorOr<GetKeyValuesReply>>> keyValueFutures;
-				for(int i = 0; i < storageServers.size(); i++)
+				try
 				{
-					resetReply(req);
-					keyValueFutures.push_back(storageServers[i].getKeyValues.getReplyUnlessFailedFor(req, 2, 0));
-				}
+					Version version = wait(self->getVersion(cx, self));
 
-				Void _ = wait(waitForAll(keyValueFutures));
+					GetKeyValuesRequest req;
+					Key prefixBegin = beginKey.withPrefix(keyServersPrefix);
+					req.begin = firstGreaterOrEqual(prefixBegin);
+					req.end = firstGreaterOrEqual(keyServersEnd);
+					req.limit = SERVER_KNOBS->MOVE_KEYS_KRM_LIMIT;
+					req.limitBytes = SERVER_KNOBS->MOVE_KEYS_KRM_LIMIT_BYTES;
+					req.version = version;
 
-				int firstValidStorageServer = -1;
-
-				//Read the shard location results
-				for(int i = 0; i < keyValueFutures.size(); i++)
-				{
-					ErrorOr<GetKeyValuesReply> reply = keyValueFutures[i].get();
-
-					if(!reply.present())
+					//Try getting the shard locations from the key servers
+					state vector<Future<ErrorOr<GetKeyValuesReply>>> keyValueFutures;
+					for(int j = 0; j < shards[i].second.size(); j++)
 					{
-						//If the storage server didn't reply in a quiescent database, then the check fails
-						if(self->performQuiescentChecks)
+						resetReply(req);
+						keyValueFutures.push_back(shards[i].second[j].getKeyValues.getReplyUnlessFailedFor(req, 2, 0));
+					}
+
+					Void _ = wait(waitForAll(keyValueFutures));
+
+					int firstValidStorageServer = -1;
+
+					//Read the shard location results
+					for(int j = 0; j < keyValueFutures.size(); j++)
+					{
+						ErrorOr<GetKeyValuesReply> reply = keyValueFutures[j].get();
+
+						if(!reply.present())
 						{
-							TraceEvent("ConsistencyCheck_KeyServerUnavailable").detail("StorageServer", storageServers[i].id().toString().c_str());
-							self->testFailure("Key server unavailable");
-							return false;
+							//If the storage server didn't reply in a quiescent database, then the check fails
+							if(self->performQuiescentChecks)
+							{
+								TraceEvent("ConsistencyCheck_KeyServerUnavailable").detail("StorageServer", shards[i].second[j].id().toString().c_str());
+								self->testFailure("Key server unavailable");
+								return false;
+							}
+
+							//If no storage servers replied, then throw all_alternatives_failed to force a retry
+							else if(firstValidStorageServer < 0 && j == keyValueFutures.size() - 1)
+								throw all_alternatives_failed();
 						}
 
-						//If no storage servers replied, then throw all_alternatives_failed to force a retry
-						else if(firstValidStorageServer < 0 && i == keyValueFutures.size() - 1)
-							throw all_alternatives_failed();
+						//If this is the first storage server, store the locations to send back to the caller
+						else if(firstValidStorageServer < 0)
+							firstValidStorageServer = j;
+
+						//Otherwise, compare the data to the results from the first storage server.  If they are different, then the check fails
+						else if(reply.get().data != keyValueFutures[firstValidStorageServer].get().get().data || reply.get().more != keyValueFutures[firstValidStorageServer].get().get().more)
+						{
+							TraceEvent("ConsistencyCheck_InconsistentKeyServers").detail("StorageServer1", shards[i].second[firstValidStorageServer].id())
+								.detail("StorageServer2", shards[i].second[j].id());
+							self->testFailure("Key servers inconsistent");
+							return false;
+						}
 					}
 
-					//If this is the first storage server, store the locations to send back to the caller
-					else if(firstValidStorageServer < 0)
-						firstValidStorageServer = i;
+					auto keyValueResponse = keyValueFutures[firstValidStorageServer].get().get();
+					Standalone<RangeResultRef> currentLocations = krmDecodeRanges( keyServersPrefix, KeyRangeRef(beginKey, endKey), RangeResultRef( keyValueResponse.data, keyValueResponse.more) );
 
-					//Otherwise, compare the data to the results from the first storage server.  If they are different, then the check fails
-					else if(reply.get().data != keyValueFutures[firstValidStorageServer].get().get().data || reply.get().more != keyValueFutures[firstValidStorageServer].get().get().more)
-					{
-						TraceEvent("ConsistencyCheck_InconsistentKeyServers").detail("StorageServer1", storageServers[firstValidStorageServer].id())
-							.detail("StorageServer2", storageServers[i].id());
-						self->testFailure("Key servers inconsistent");
-						return false;
-					}
+					//At least two entries are required (the last one is carried over as the next starting key); check before size() - 1 is used below
+					ASSERT(currentLocations.size() > 1);
+
+					//Push all but the last item, which will be pushed as the first item next iteration
+					keyLocations.append_deep(keyLocations.arena(), currentLocations.begin(), currentLocations.size() - 1);
+					beginKey = currentLocations.end()[-1].key; //Next iteration should pick up where we left off
+
+					//If this is the last iteration, then push the allKeys.end KV pair
+					if(beginKey == allKeys.end)
+						keyLocations.push_back_deep(keyLocations.arena(), currentLocations.end()[-1]);
+				}
+				catch(Error &e)
+				{
+					//If we failed because of a version problem, then retry
+					if(e.code() == error_code_past_version || e.code() == error_code_future_version)
+						TraceEvent("ConsistencyCheck_RetryGetKeyLocations").error(e);
+					else
+						throw;
 				}
-
-				auto keyValueResponse = keyValueFutures[firstValidStorageServer].get().get();
-				Standalone<RangeResultRef> currentLocations = krmDecodeRanges( keyServersPrefix, KeyRangeRef(beginKey, allKeys.end), RangeResultRef( keyValueResponse.data, keyValueResponse.more) );
-
-				//Push all but the last item, which will be pushed as the first item next iteration
-				keyLocations.append_deep(keyLocations.arena(), currentLocations.begin(), currentLocations.size() - 1);
-
-				//Next iteration should pick up where we left off
-				ASSERT(currentLocations.size() > 1);
-				beginKey = currentLocations.end()[-1].key;
-
-				//If this is the last iteration, then push the allKeys.end KV pair
-				if(beginKey == allKeys.end)
-					keyLocations.push_back_deep(keyLocations.arena(), currentLocations.end()[-1]);
-			}
-			catch(Error &e)
-			{
-				//If we failed because of a version problem, then retry
-				if(e.code() == error_code_past_version || e.code() == error_code_future_version || e.code() == error_code_past_version)
-					TraceEvent("ConsistencyCheck_RetryGetKeyLocations").error(e);
-				else
-					throw;
 			}
 		}
 
+
 		keyLocationPromise.send(keyLocations);
 		return true;
 	}
@ -532,6 +546,7 @@ struct ConsistencyCheckWorkload : TestWorkload

 	ACTOR Future<int64_t> getDatabaseSize(Database cx) {
 		state Transaction tr( cx );
+		tr.setOption(FDBTransactionOptions::LOCK_AWARE);
 		loop {
 			try {
 				StorageMetrics metrics = wait( tr.getStorageMetrics( KeyRangeRef(allKeys.begin, keyServersPrefix), 100000 ) );
@ -1084,6 +1099,7 @@ struct ConsistencyCheckWorkload : TestWorkload
 				if(!statefulProcesses[itr->first.address()].count(id)) {
 					TraceEvent("ConsistencyCheck_ExtraDataStore").detail("Address", itr->first.address()).detail("DataStoreID", id);
 					if(g_network->isSimulated()) {
+						TraceEvent("ConsistencyCheck_RebootProcess").detail("Address", itr->first.address()).detail("DataStoreID", id);
 						g_simulator.rebootProcess(g_simulator.getProcessByAddress(itr->first.address()), ISimulator::RebootProcess);
 					}

--- a/fdbserver/workloads/DummyWorkload.actor.cpp
+++ b/fdbserver/workloads/DummyWorkload.actor.cpp
@ -23,9 +23,13 @@

 // The workload that do nothing. It can be used for waiting for quiescence
 struct DummyWorkload : TestWorkload {
+	bool displayWorkers;
+	double displayDelay;

 	DummyWorkload(WorkloadContext const& wcx)
 	: TestWorkload(wcx) {
+		displayWorkers = getOption(options, LiteralStringRef("displayWorkers"), true); // when true (the default), start() on client 0 runs _start(), which calls g_simulator.displayWorkers()
+		displayDelay = getOption(options, LiteralStringRef("displayDelay"), 0.0); // delay _start() waits before displaying; the default 0.0 (or any non-positive value) means no wait
 	}

 	virtual std::string description() {
@ -33,6 +37,16 @@ struct DummyWorkload : TestWorkload {
 	}

 	virtual Future<Void> start(Database const& cx) {
+		// Only client 0 displays the workers, and only when the displayWorkers option is enabled
+		if ((clientId != 0) || (!displayWorkers))
+			return Void();
+		return _start(this, cx);
+	}
+
+	ACTOR static Future<Void> _start( DummyWorkload* self, Database cx) { // waits displayDelay first when it is positive, then asks the simulator to display its workers
+		if (self->displayDelay > 0.0)
+			Void _ = wait(delay(self->displayDelay));
+		if (g_network->isSimulated()) g_simulator.displayWorkers(); // guarded: g_simulator is only touched behind isSimulated() elsewhere in the codebase
 		return Void();
 	}

--- a/fdbserver/workloads/LowLatency.actor.cpp
+++ b/fdbserver/workloads/LowLatency.actor.cpp
@ -0,0 +1,101 @@
+/*
+ * LowLatency.actor.cpp
+ *
+ * This source file is part of the FoundationDB open source project
+ *
+ * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors
+ * 
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "flow/actorcompiler.h"
+#include "fdbrpc/ContinuousSample.h"
+#include "fdbclient/NativeAPI.h"
+#include "fdbserver/TesterInterface.h"
+#include "fdbclient/ReadYourWrites.h"
+#include "fdbserver/Knobs.h"
+#include "workloads.h"
+
+struct LowLatencyWorkload : TestWorkload { // Repeatedly fetches a read version and marks the test failed if any fetch (retries included) takes longer than maxLatency
+	double testDuration; // how long the probe loop keeps running, measured with now() (option "testDuration", default 600.0)
+	double maxLatency; // largest acceptable now() difference for a single probe (option "maxLatency", default 20.0)
+	double checkDelay; // delay passed to delay() before each probe (option "checkDelay", default 1.0)
+	PerfIntCounter operations, retries; // operations: probes started; retries: attempts that failed and went through tr.onError()
+	bool ok; // starts true, cleared when a probe exceeds maxLatency; this is the value check() returns
+
+	LowLatencyWorkload(WorkloadContext const& wcx)
+		: TestWorkload(wcx), operations("Operations"), retries("Retries") , ok(true)
+	{
+		testDuration = getOption( options, LiteralStringRef("testDuration"), 600.0 );
+		maxLatency = getOption( options, LiteralStringRef("maxLatency"), 20.0 );
+		checkDelay = getOption( options, LiteralStringRef("checkDelay"), 1.0 );
+	}
+
+	virtual std::string description() { return "LowLatency"; }
+
+	virtual Future<Void> setup( Database const& cx ) { // nothing to prepare; completes immediately
+		return Void();
+	}
+
+	virtual Future<Void> start( Database const& cx ) { // only client 0 runs the probe loop; every other client finishes immediately
+		if( clientId == 0 )
+			return _start( cx, this );
+		return Void();
+	}
+
+	ACTOR static Future<Void> _start( Database cx, LowLatencyWorkload* self ) { // probe loop: delay, time one getReadVersion, repeat until testDuration has elapsed
+		state double testStart = now();
+		try {
+			loop {
+				Void _ = wait( delay( self->checkDelay ) );
+				state Transaction tr( cx ); // a new transaction object for every probe
+				state double operationStart = now(); // taken before the first attempt, so time spent in retries counts toward the latency
+				++self->operations;
+				loop {
+					try {
+						tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); // both options are set again on every attempt
+						tr.setOption(FDBTransactionOptions::LOCK_AWARE);
+						Version _ = wait(tr.getReadVersion()); // the probe itself; the returned version is discarded
+						break;
+					} catch( Error &e ) {
+						Void _ = wait( tr.onError(e) ); // presumably rethrows errors that cannot be retried, which would land in the outer catch — confirm
+						++self->retries;
+					}
+				}
+				if(now() - operationStart > self->maxLatency) {
+					TraceEvent(SevError, "LatencyTooLarge").detail("maxLatency", self->maxLatency).detail("observedLatency", now() - operationStart);
+					self->ok = false; // remembered so check() reports the failure
+				}
+				if( now() - testStart > self->testDuration ) // checked only after a probe completes, so at least one probe always runs
+					break;
+			}
+			return Void();
+		} catch( Error &e ) {
+			TraceEvent(SevError, "LowLatencyError").error(e,true); // log the error, then propagate it unchanged
+			throw;
+		}
+	}
+
+	virtual Future<bool> check( Database const& cx ) { // passes iff no probe exceeded maxLatency
+		return ok;
+	}
+
+	virtual void getMetrics( vector<PerfMetric>& m ) { // appends an operations-per-second metric plus the two raw counters to m
+		double duration = testDuration; // the configured duration, not the measured elapsed time
+		m.push_back( PerfMetric( "Operations/sec", operations.getValue() / duration, false ) );
+		m.push_back( operations.getMetric() );
+		m.push_back( retries.getMetric() );
+	}
+};
+
+WorkloadFactory<LowLatencyWorkload> LowLatencyWorkloadFactory("LowLatency"); // presumably makes this workload selectable by the name "LowLatency" — confirm against WorkloadFactory
--- a/fdbserver/workloads/MachineAttrition.actor.cpp
+++ b/fdbserver/workloads/MachineAttrition.actor.cpp
@ -42,6 +42,8 @@ struct MachineAttritionWorkload : TestWorkload {
 	bool killDc;
 	bool killSelf;
 	bool replacement;
+	bool waitForVersion;
+	bool allowFaultInjection;

 	// This is set in setup from the list of workers when the cluster is started
 	std::vector<LocalityData> machines;
@ -57,6 +59,8 @@ struct MachineAttritionWorkload : TestWorkload {
 		killDc = getOption( options, LiteralStringRef("killDc"), g_random->random01() < 0.25 );
 		killSelf = getOption( options, LiteralStringRef("killSelf"), false );
 		replacement = getOption( options, LiteralStringRef("replacement"), reboot && g_random->random01() < 0.5 );
+		waitForVersion = getOption( options, LiteralStringRef("waitForVersion"), false );
+		allowFaultInjection = getOption( options, LiteralStringRef("allowFaultInjection"), true );
 	}

 	static vector<ISimulator::ProcessInfo*> getServers() {
@ -93,7 +97,7 @@ struct MachineAttritionWorkload : TestWorkload {
 				.detail("MeanDelay", meanDelay);

 			return timeout(
-				reportErrorsExcept( machineKillWorker( this, meanDelay ), "machineKillWorkerError", UID(), &normalAttritionErrors()),
+				reportErrorsExcept( machineKillWorker( this, meanDelay, cx ), "machineKillWorkerError", UID(), &normalAttritionErrors()),
 				testDuration, Void() );
 		}
 		if(killSelf)
@ -111,7 +115,7 @@ struct MachineAttritionWorkload : TestWorkload {
 		StringRef uid;
 	};

-	ACTOR static Future<Void> machineKillWorker( MachineAttritionWorkload *self, double meanDelay ) {
+	ACTOR static Future<Void> machineKillWorker( MachineAttritionWorkload *self, double meanDelay, Database cx ) {
 		state int killedMachines = 0;
 		state double delayBeforeKill = g_random->random01() * meanDelay;
 		state std::set<UID> killedUIDs;
@ -151,10 +155,25 @@ struct MachineAttritionWorkload : TestWorkload {
 				Void _ = wait( delay( delayBeforeKill ) );
 				TraceEvent("WorkerKillAfterDelay");

+				if(self->waitForVersion) {
+					state Transaction tr( cx );
+					loop {
+						try {
+							tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
+							tr.setOption(FDBTransactionOptions::LOCK_AWARE);
+							Version _ = wait(tr.getReadVersion());
+							break;
+						} catch( Error &e ) {
+							Void _ = wait( tr.onError(e) );
+						}
+					}
+				}
+
 				// decide on a machine to kill
 				LocalityData targetMachine = self->machines.back();

 				TraceEvent("Assassination").detail("TargetMachine", targetMachine.toString())
+					.detailext("zoneId", targetMachine.zoneId())
 					.detail("Reboot", self->reboot).detail("killedMachines", killedMachines)
 					.detail("machinesToKill", self->machinesToKill).detail("machinesToLeave", self->machinesToLeave)
 					.detail("machines", self->machines.size()).detail("Replace", self->replacement);
@ -166,12 +185,13 @@ struct MachineAttritionWorkload : TestWorkload {
 						g_simulator.killMachine( targetMachine.zoneId(), ISimulator::Reboot );
 					}
 				} else {
-					TraceEvent("WorkerKill").detail("MachineCount", self->machines.size());
-					if( g_random->random01() < 0.33 ) {
+					auto randomDouble = g_random->random01();
+					TraceEvent("WorkerKill").detail("MachineCount", self->machines.size()).detail("RandomValue", randomDouble);
+					if (randomDouble < 0.33 ) {
 						TraceEvent("RebootAndDelete").detail("TargetMachine", targetMachine.toString());
 						g_simulator.killMachine( targetMachine.zoneId(), ISimulator::RebootAndDelete );
 					} else {
-						auto kt = g_random->random01() < 0.5 ? ISimulator::KillInstantly : ISimulator::InjectFaults;
+						auto kt = (g_random->random01() < 0.5 || !self->allowFaultInjection) ? ISimulator::KillInstantly : ISimulator::InjectFaults;
 						g_simulator.killMachine( targetMachine.zoneId(), kt );
 					}
 				}
--- a/fdbserver/workloads/RandomMoveKeys.actor.cpp
+++ b/fdbserver/workloads/RandomMoveKeys.actor.cpp
@ -21,6 +21,7 @@
 #include "flow/actorcompiler.h"
 #include "fdbrpc/simulator.h"
 #include "fdbclient/StorageServerInterface.h"
+#include "fdbclient/ManagementAPI.h"
 #include "fdbserver/MoveKeys.h"
 #include "fdbclient/NativeAPI.h"
 #include "workloads.h"
@ -65,15 +66,13 @@ struct MoveKeysWorkload : TestWorkload {
 				}
 			}

-			state int oldMode = wait( self->setDDMode( cx, 0 ) );
+			state int oldMode = wait( setDDMode( cx, 0 ) );
 			TraceEvent("RMKStartModeSetting");
 			Void _ = wait( timeout( reportErrors( self->worker( cx, self ), "moveKeysWorkloadWorkerError" ), self->testDuration, Void() ) );
 			// Always set the DD mode back, even if we die with an error
 			TraceEvent("RMKDoneMoving");
-			int _ = wait( self->setDDMode( cx, oldMode ) );
+			int _ = wait( setDDMode( cx, oldMode ) );
 			TraceEvent("RMKDoneModeSetting");
-			Void _ = wait( self->forceMasterFailure(cx, self) );
-			TraceEvent("RMKDoneKillingMaster");
 		}
 		return Void();
 	}
@ -116,33 +115,6 @@ struct MoveKeysWorkload : TestWorkload {
 		return vector<StorageServerInterface>(t.begin(), t.end());
 	}

-	ACTOR Future<int> setDDMode( Database cx, int mode ) {
-		state Transaction tr(cx);
-		state int oldMode = -1;
-		state BinaryWriter wr(Unversioned());
-		wr << mode;
-
-		loop {
-			try {
-				Optional<Value> old = wait( tr.get( dataDistributionModeKey ) );
-				if (oldMode < 0) {
-					oldMode = 1;
-					if (old.present()) {
-						BinaryReader rd(old.get(), Unversioned());
-						rd >> oldMode;
-					}
-				}
-				tr.set( dataDistributionModeKey, wr.toStringRef() );
-
-				Void _ = wait( tr.commit() );
-				return oldMode;
-			} catch (Error& e) {
-				TraceEvent("setDDModeRetrying").error(e);
-				Void _ = wait (tr.onError(e));
-			}
-		}
-	}
-
 	ACTOR Future<Void> doMoveKeys(Database cx, MoveKeysWorkload *self, KeyRange keys, vector<StorageServerInterface> destinationTeam, 
 			MoveKeysLock lock, std::string dbName ) {
 		state TraceInterval relocateShardInterval("RelocateShard");
--- a/fdbserver/workloads/RemoveServersSafely.actor.cpp
+++ b/fdbserver/workloads/RemoveServersSafely.actor.cpp
@ -26,9 +26,6 @@
 #include "fdbrpc/simulator.h"
 #include "fdbclient/ManagementAPI.h"

-const char*		removeClearEnv = getenv("REMOVE_CLEAR");
-int						removeClear = removeClearEnv ? atoi(removeClearEnv) : 1;
-
 template <>
 std::string describe( uint32_t const& item ) {
 	return format("%d", item);
@ -154,6 +151,7 @@ struct RemoveServersSafelyWorkload : TestWorkload {
 	{
 		std::vector<ISimulator::ProcessInfo*>	processes;
 		std::set<AddressExclusion>	processAddrs;
+		UID functionId = g_nondeterministic_random->randomUniqueID();

 		// Get the list of process network addresses
 		for (auto& netAddr : netAddrs) {
@ -170,24 +168,64 @@ struct RemoveServersSafelyWorkload : TestWorkload {
 		// Get the list of processes matching network address
 		for (auto processInfo : g_simulator.getAllProcesses()) {
 			auto processNet = AddressExclusion(processInfo->address.ip, processInfo->address.port);
-			if (processAddrs.find(processNet) != processAddrs.end())
+			if (processAddrs.find(processNet) != processAddrs.end()) {
 				processes.push_back(processInfo);
+				TraceEvent("RemoveAndKill", functionId).detail("Step", "getProcessItem").detail("ProcessAddress", processInfo->address).detail("Process", describe(*processInfo)).detail("failed", processInfo->failed).detail("excluded", processInfo->excluded).detail("rebooting", processInfo->rebooting).detail("Protected", g_simulator.protectedAddresses.count(processInfo->address));
+			}
+			else {
+				TraceEvent("RemoveAndKill", functionId).detail("Step", "getProcessNoItem").detail("ProcessAddress", processInfo->address).detail("Process", describe(*processInfo)).detail("failed", processInfo->failed).detail("excluded", processInfo->excluded).detail("rebooting", processInfo->rebooting).detail("Protected", g_simulator.protectedAddresses.count(processInfo->address));
+			}
 		}
-		TraceEvent("RemoveAndKill").detail("Step", "getProcesses")
+		TraceEvent("RemoveAndKill", functionId).detail("Step", "getProcesses")
+			.detail("netAddrSize",netAddrs.size()).detail("processAddrSize",processAddrs.size())
 			.detail("netAddrs",describe(netAddrs)).detail("processAddrs",describe(processAddrs))
 			.detail("Proceses", processes.size()).detail("MachineProcesses", machineProcesses.size());

-		// Processes may have been destroyed causing
-//		ASSERT(processAddrs.size() == processes.size());
 		return processes;
 	}

+	// Excludes every address in procAddrs in the simulator and sets the `excluded`
+	// flag on each process that getProcesses() currently finds for those addresses.
+	// Returns the processes that were flagged.
+	virtual std::vector<ISimulator::ProcessInfo*> excludeAddresses(std::set<AddressExclusion> const& procAddrs)
+	{
+		// Look the processes up afresh: reboots, deletes, etc may have changed them
+		std::vector<ISimulator::ProcessInfo*> matched = getProcesses(procAddrs);
+
+		TraceEvent("RemoveAndKill")
+			.detail("Step", "exclude addresses")
+			.detail("AddrTotal", procAddrs.size())
+			.detail("ProcTotal", matched.size())
+			.detail("Addresses", describe(procAddrs))
+			.detail("ClusterAvailable", g_simulator.isAvailable());
+
+		// Exclude each requested address in the simulator
+		for (auto addrIt = procAddrs.begin(); addrIt != procAddrs.end(); ++addrIt) {
+			g_simulator.excludeAddress(NetworkAddress(addrIt->ip, addrIt->port, true, false));
+		}
+
+		// Flag each process that was found, tracing its state after the change
+		for (size_t i = 0; i < matched.size(); ++i) {
+			ISimulator::ProcessInfo* proc = matched[i];
+			proc->excluded = true;
+			TraceEvent("RemoveAndKill")
+				.detail("Step", "ExcludeAddress")
+				.detail("ProcessAddress", proc->address)
+				.detail("Process", describe(*proc))
+				.detail("failed", proc->failed)
+				.detail("rebooting", proc->rebooting)
+				.detail("ClusterAvailable", g_simulator.isAvailable());
+		}
+		return matched;
+	}
+
+	// Re-includes every address in procAddrs in the simulator and clears the
+	// `excluded` flag on each non-failed process that getProcesses() currently
+	// finds for those addresses. Returns the processes that were looked up.
+	virtual std::vector<ISimulator::ProcessInfo*> includeAddresses(std::set<AddressExclusion> const& procAddrs)
+	{
+		// Look the processes up afresh: reboots, deletes, etc may have changed them
+		std::vector<ISimulator::ProcessInfo*> matched = getProcesses(procAddrs);
+
+		TraceEvent("RemoveAndKill")
+			.detail("Step", "include addresses")
+			.detail("AddrTotal", procAddrs.size())
+			.detail("ProcTotal", matched.size())
+			.detail("Addresses", describe(procAddrs))
+			.detail("ClusterAvailable", g_simulator.isAvailable());
+
+		// Include each requested address in the simulator
+		for (auto addrIt = procAddrs.begin(); addrIt != procAddrs.end(); ++addrIt) {
+			g_simulator.includeAddress(NetworkAddress(addrIt->ip, addrIt->port, true, false));
+		}
+
+		for (size_t i = 0; i < matched.size(); ++i) {
+			ISimulator::ProcessInfo* proc = matched[i];
+			// A failed process keeps its exclusion member, since it will require a reboot to revive it
+			if (!proc->failed) {
+				proc->excluded = false;
+			}
+			TraceEvent("RemoveAndKill")
+				.detail("Step", "IncludeAddress")
+				.detail("ProcessAddress", proc->address)
+				.detail("Process", describe(*proc))
+				.detail("failed", proc->failed)
+				.detail("rebooting", proc->rebooting)
+				.detail("ClusterAvailable", g_simulator.isAvailable());
+		}
+		return matched;
+	}
+
 	virtual std::vector<ISimulator::ProcessInfo*> protectServers(std::set<AddressExclusion> const& killAddrs)
 	{
 		std::vector<ISimulator::ProcessInfo*>	processes;
 		std::set<AddressExclusion>	processAddrs;
 		std::vector<AddressExclusion> killableAddrs;
-		std::vector<ISimulator::ProcessInfo*>	killProcesses, killableProcesses, processesLeft, processesDead;
+		std::vector<ISimulator::ProcessInfo*>	killProcArray, killableProcesses, processesLeft, processesDead;

 		// Get the list of processes matching network address
 		for (auto processInfo : getServers()) {
@ -199,7 +237,7 @@ struct RemoveServersSafelyWorkload : TestWorkload {
 			else if (killAddrs.find(processNet) == killAddrs.end())
 				processesLeft.push_back(processInfo);
 			else
-				killProcesses.push_back(processInfo);
+				killProcArray.push_back(processInfo);
 		}

 		// Identify the largest set of processes which can be killed
@ -207,22 +245,22 @@ struct RemoveServersSafelyWorkload : TestWorkload {
 		bool bCanKillProcess;
 		ISimulator::ProcessInfo*	randomProcess;
 		auto deadProcess = processesDead.back();
-		for (int killsLeft = killProcesses.size(); killsLeft > 0; killsLeft --)
+		for (int killsLeft = killProcArray.size(); killsLeft > 0; killsLeft --)
 		{
 			// Select a random kill process
 			randomIndex = g_random->randomInt(0, killsLeft);
-			randomProcess = killProcesses[randomIndex];
+			randomProcess = killProcArray[randomIndex];
 			processesDead.push_back(randomProcess);
-			killProcesses[randomIndex] = killProcesses.back();
-			killProcesses.pop_back();
+			killProcArray[randomIndex] = killProcArray.back();
+			killProcArray.pop_back();
 			// Add all of the remaining processes the leftover array
-			processesLeft.insert(processesLeft.end(), killProcesses.begin(), killProcesses.end());
+			processesLeft.insert(processesLeft.end(), killProcArray.begin(), killProcArray.end());

 			// Check if we can kill the added process
 			bCanKillProcess = g_simulator.canKillProcesses(processesLeft, processesDead, ISimulator::KillInstantly, NULL);

 			// Remove the added processes
-			processesLeft.resize(processesLeft.size() - killProcesses.size());
+			processesLeft.resize(processesLeft.size() - killProcArray.size());

 			if (bCanKillProcess) {
 				killableProcesses.push_back(randomProcess);
@ -247,94 +285,133 @@ struct RemoveServersSafelyWorkload : TestWorkload {

 		// Removing the first set of machines might legitimately bring the database down, so a timeout is not an error
 		state std::vector<NetworkAddress> firstCoordinators;
-		state std::vector<ISimulator::ProcessInfo*>	killProcesses;
+		state std::vector<ISimulator::ProcessInfo*>	killProcArray;
+		state bool bClearedFirst;

-		TraceEvent("RemoveAndKill").detail("Step", "exclude first list").detail("toKill1", describe(toKill1)).detail("KillTotal", toKill1.size())
-			.detail("ClusterAvailable", g_simulator.isAvailable());
+		TraceEvent("RemoveAndKill").detail("Step", "exclude list first").detail("toKill", describe(toKill1)).detail("KillTotal", toKill1.size()).detail("ClusterAvailable", g_simulator.isAvailable());
+		self->excludeAddresses(toKill1);

-			killProcesses = self->getProcesses(toKill1);
-			TraceEvent("RemoveAndKill").detail("Step", "mark first processes excluded").detail("Addresses", describe(toKill1))
-				.detail("AddressTotal", toKill1.size()).detail("Processes", killProcesses.size())
-				.detail("ClusterAvailable", g_simulator.isAvailable());
-			for (auto& killProcess : killProcesses) {
-				killProcess->excluded = true;
-				g_simulator.excludeAddress(killProcess->address);
-				TraceEvent("RemoveAndKill").detail("Step", "MarkProcessFirst").detail("Process", describe(*killProcess));
-			}
+		Optional<Void> result = wait( timeout( removeAndKill( self, cx, toKill1, NULL), self->kill1Timeout ) );

-		Optional<Void> result = wait( timeout( removeAndKill( self, cx, toKill1), self->kill1Timeout ) );
+		bClearedFirst = result.present();

-		TraceEvent("RemoveAndKill").detail("Step", "first exclusion result").detail("result", result.present() ? "succeeded" : "failed");
-		killProcesses = self->getProcesses(toKill1);
-		TraceEvent("RemoveAndKill").detail("Step", "include first processes").detail("toKill1", describe(toKill1))
-			.detail("KillTotal", toKill1.size()).detail("Processes", killProcesses.size());
-		for (auto& killProcess : killProcesses) {
-			g_simulator.includeAddress(killProcess->address);
-			killProcess->excluded = false;
+		TraceEvent("RemoveAndKill").detail("Step", "excluded list first").detail("excluderesult", bClearedFirst ? "succeeded" : "failed").detail("KillTotal", toKill1.size()).detail("Processes", killProcArray.size()).detail("toKill1", describe(toKill1)).detail("ClusterAvailable", g_simulator.isAvailable());
+
+		bClearedFirst=false;
+		// Include the servers, if unable to exclude
+		if (!bClearedFirst) {
+			// Get the updated list of processes which may have changed due to reboots, deletes, etc
+			TraceEvent("RemoveAndKill").detail("Step", "include all first").detail("KillTotal", toKill1.size()).detail("toKill", describe(toKill1)).detail("ClusterAvailable", g_simulator.isAvailable());
+			Void _ = wait( includeServers( cx, vector<AddressExclusion>(1) ) );
+			self->includeAddresses(toKill1);
+			TraceEvent("RemoveAndKill").detail("Step", "included all first").detail("KillTotal", toKill1.size()).detail("toKill", describe(toKill1)).detail("ClusterAvailable", g_simulator.isAvailable());
 		}

-		killProcesses = self->protectServers(toKill2);
+		// Get the list of protected servers
+		killProcArray = self->protectServers(toKill2);

 		// Update the kill networks to the killable processes
-		toKill2 = self->getNetworks(killProcesses);
+		toKill2 = self->getNetworks(killProcArray);

-		TraceEvent("RemoveAndKill").detail("Step", "Mark second processes excluded").detail("toKill2", describe(toKill2))
-			.detail("KillTotal", toKill2.size()).detail("Processes", killProcesses.size());
-		for (auto& killProcess : killProcesses) {
-			killProcess->excluded = true;
-			g_simulator.excludeAddress(killProcess->address);
-			TraceEvent("RemoveAndKill").detail("Step", "MarkProcessSecond").detail("Processes", killProcesses.size()).detail("Process", describe(*killProcess));
-		}
+		TraceEvent("RemoveAndKill").detail("Step", "exclude list second").detail("KillTotal", toKill2.size()).detail("toKill", describe(toKill2)).detail("ClusterAvailable", g_simulator.isAvailable());
+		self->excludeAddresses(toKill2);

 		// The second set of machines is selected so that we can always make progress without it, even after the permitted number of other permanent failures
 		// so we expect to succeed after a finite amount of time
 		state Future<Void> disabler = disableConnectionFailuresAfter( self->kill2Timeout/2, "RemoveServersSafely" );
 		TraceEvent("RemoveAndKill").detail("Step", "exclude second list").detail("toKill2", describe(toKill2)).detail("KillTotal", toKill2.size())
-			.detail("Processes", killProcesses.size()).detail("ClusterAvailable", g_simulator.isAvailable());
-		Void _ = wait( reportErrors( timeoutError( removeAndKill( self, cx, toKill2), self->kill2Timeout ), "RemoveServersSafelyError", UID() ) );
+			.detail("Processes", killProcArray.size()).detail("ClusterAvailable", g_simulator.isAvailable());
+		Void _ = wait( reportErrors( timeoutError( removeAndKill( self, cx, toKill2, bClearedFirst ? &toKill1 : NULL), self->kill2Timeout ), "RemoveServersSafelyError", UID() ) );

-
-		TraceEvent("RemoveAndKill").detail("Step", "excluded second list").detail("KillTotal", toKill2.size()).detail("Excluded", killProcesses.size())
-			.detail("ClusterAvailable", g_simulator.isAvailable());
+		TraceEvent("RemoveAndKill").detail("Step", "excluded second list").detail("KillTotal", toKill1.size()).detail("toKill", describe(toKill2)).detail("ClusterAvailable", g_simulator.isAvailable());

 		// Reinclude all of the machine, if buggified
 		if (BUGGIFY) {
-			TraceEvent("RemoveAndKill").detail("Step", "final include all").detail("ClusterAvailable", g_simulator.isAvailable());
+			// Get the updated list of processes which may have changed due to reboots, deletes, etc
+			TraceEvent("RemoveAndKill").detail("Step", "include all second").detail("KillTotal", toKill1.size()).detail("toKill", describe(toKill2)).detail("ClusterAvailable", g_simulator.isAvailable());
 			Void _ = wait( includeServers( cx, vector<AddressExclusion>(1) ) );
-			for (auto& killProcess : killProcesses) {
-				g_simulator.includeAddress(killProcess->address);
-				killProcess->excluded = false;
-			}
-			TraceEvent("RemoveAndKill").detail("Step", "final included all").detail("ClusterAvailable", g_simulator.isAvailable());
+			self->includeAddresses(toKill2);
+			TraceEvent("RemoveAndKill").detail("Step", "included all second").detail("KillTotal", toKill1.size()).detail("toKill", describe(toKill2)).detail("ClusterAvailable", g_simulator.isAvailable());
 		}

 		return Void();
 	}

-	ACTOR static Future<Void> removeAndKill( RemoveServersSafelyWorkload* self, Database cx, std::set<AddressExclusion> toKill)
+	virtual std::vector<ISimulator::ProcessInfo*> killAddresses(std::set<AddressExclusion> const& killAddrs) // Reboot-and-deletes (or, per machine, kills) the simulated processes found at killAddrs; returns the processes that getProcesses() found.
 	{
-		// First clear the exclusion list and exclude the given list
-		TraceEvent("RemoveAndKill").detail("Step", "include all").detail("ClusterAvailable", g_simulator.isAvailable());
-		Void _ = wait( includeServers( cx, vector<AddressExclusion>(1) ) );
-		TraceEvent("RemoveAndKill").detail("Step", "included all").detail("ClusterAvailable", g_simulator.isAvailable());
+		UID functionId = g_nondeterministic_random->randomUniqueID(); // id attached to most of the trace events emitted by this call
+		bool removeViaClear = !BUGGIFY; // true selects the reboot-and-delete actions below
+		std::vector<ISimulator::ProcessInfo*>	killProcArray;
+		std::vector<AddressExclusion>	toKillArray; // filled by the copy below but not read again in this function
 
-		state std::vector<ISimulator::ProcessInfo*>	killProcesses;
+		std::copy(killAddrs.begin(), killAddrs.end(), std::back_inserter(toKillArray));
+		killProcArray = getProcesses(killAddrs);
+
+		// Reboot and delete or kill the servers
+		if( killProcesses ) { // workload member flag: act on individual processes instead of whole machines
+			TraceEvent("RemoveAndKill", functionId).detail("Step", removeViaClear ? "ClearProcesses" : "IgnoreProcesses").detail("Addresses", describe(killAddrs))
+				.detail("Processes", killProcArray.size()).detail("ClusterAvailable", g_simulator.isAvailable()).detail("RemoveViaClear", removeViaClear);
+			for (auto& killProcess : killProcArray) {
+				if (g_simulator.protectedAddresses.count(killProcess->address)) // protected addresses are only logged, never rebooted
+					TraceEvent("RemoveAndKill", functionId).detail("Step", "NoKill Process").detail("Process", describe(*killProcess)).detail("failed", killProcess->failed).detail("rebooting", killProcess->rebooting).detail("ClusterAvailable", g_simulator.isAvailable()).detail("Protected", g_simulator.protectedAddresses.count(killProcess->address));
+				else if (removeViaClear) {
+					g_simulator.rebootProcess( killProcess, ISimulator::RebootProcessAndDelete);
+					TraceEvent("RemoveAndKill", functionId).detail("Step", "Clear Process").detail("Process", describe(*killProcess)).detail("failed", killProcess->failed).detail("rebooting", killProcess->rebooting).detail("ClusterAvailable", g_simulator.isAvailable()).detail("Protected", g_simulator.protectedAddresses.count(killProcess->address));
+				}
+/* NOTE(review): with this branch disabled, unprotected processes are left untouched when removeViaClear is false (the step above is then logged as "IgnoreProcesses").
+				else {
+					g_simulator.killProcess( killProcess, ISimulator::KillInstantly );
+					TraceEvent("RemoveAndKill", functionId).detail("Step", "Kill Process").detail("Process", describe(*killProcess)).detail("failed", killProcess->failed).detail("rebooting", killProcess->rebooting).detail("ClusterAvailable", g_simulator.isAvailable()).detail("Protected", g_simulator.protectedAddresses.count(killProcess->address));
+				}
+*/
+			}
+		}
+		else {
+			std::set<Optional<Standalone<StringRef>>> zoneIds; // a set, so each zone is acted on once even if several processes share it
+			bool killedMachine; // assigned inside the loop below before it is read
+			for (auto& killProcess : killProcArray) {
+				zoneIds.insert(killProcess->locality.zoneId());
+			}
+			TraceEvent("RemoveAndKill", functionId).detail("Step", removeViaClear ? "ClearMachines" : "KillMachines").detail("Addresses", describe(killAddrs)).detail("Processes", killProcArray.size()).detail("Zones", zoneIds.size()).detail("ClusterAvailable", g_simulator.isAvailable());
+			for (auto& zoneId : zoneIds) {
+				killedMachine = g_simulator.killMachine( zoneId, removeViaClear ? ISimulator::RebootAndDelete : ISimulator::KillInstantly, removeViaClear);
+				TraceEvent(killedMachine ? SevInfo : SevWarn, "RemoveAndKill").detail("Step", removeViaClear ? "Clear Machine" : "Kill Machine").detailext("ZoneId", zoneId).detail(removeViaClear ? "Cleared" : "Killed", killedMachine).detail("ClusterAvailable", g_simulator.isAvailable()); // logged at SevWarn when killMachine() returns false
+			}
+		}
+
+		return killProcArray;
+	}
+
+	ACTOR static Future<Void> removeAndKill( RemoveServersSafelyWorkload* self, Database cx, std::set<AddressExclusion> toKill, std::set<AddressExclusion>* pIncAddrs)
+	{
+		state UID functionId = g_nondeterministic_random->randomUniqueID();
+
+		// First clear the exclusion list and exclude the given list
+		TraceEvent("RemoveAndKill", functionId).detail("Step", "include all").detail("ClusterAvailable", g_simulator.isAvailable());
+		Void _ = wait( includeServers( cx, vector<AddressExclusion>(1) ) );
+		TraceEvent("RemoveAndKill", functionId).detail("Step", "included all").detail("ClusterAvailable", g_simulator.isAvailable());
+		// Reinclude the addresses that were excluded, if present
+		if (pIncAddrs) {
+			self->includeAddresses(*pIncAddrs);
+		}
+
+		state std::vector<ISimulator::ProcessInfo*>	killProcArray;
 		state std::vector<AddressExclusion>	toKillArray;

 		std::copy(toKill.begin(), toKill.end(), std::back_inserter(toKillArray));
-		killProcesses = self->getProcesses(toKill);
+		killProcArray = self->getProcesses(toKill);

-		TraceEvent("RemoveAndKill").detail("Step", "Activate Server Exclusion").detail("toKill", describe(toKill)).detail("Addresses", describe(toKillArray)).detail("ClusterAvailable", g_simulator.isAvailable());
+		TraceEvent("RemoveAndKill", functionId).detail("Step", "Activate Server Exclusion").detail("KillAddrs", toKill.size()).detail("KillProcs", killProcArray.size()).detail("MissingProcs", toKill.size()!=killProcArray.size()).detail("toKill", describe(toKill)).detail("Addresses", describe(toKillArray)).detail("ClusterAvailable", g_simulator.isAvailable());
 		Void _ = wait( excludeServers( cx, toKillArray ) );

 		// We need to skip at least the quorum change if there's nothing to kill, because there might not be enough servers left
 		// alive to do a coordinators auto (?)
 		if (toKill.size()) {
 			// Wait for removal to be safe
-			TraceEvent("RemoveAndKill").detail("Step", "Wait For Server Exclusion").detail("Addresses", describe(toKill)).detail("ClusterAvailable", g_simulator.isAvailable());
+			TraceEvent("RemoveAndKill", functionId).detail("Step", "Wait For Server Exclusion").detail("Addresses", describe(toKill)).detail("ClusterAvailable", g_simulator.isAvailable());
 			Void _ = wait( waitForExcludedServers( cx, toKillArray ) );

-			TraceEvent("RemoveAndKill").detail("Step", "coordinators auto").detail("desiredCoordinators", g_simulator.desiredCoordinators).detail("ClusterAvailable", g_simulator.isAvailable());
+			TraceEvent("RemoveAndKill", functionId).detail("Step", "coordinators auto").detail("desiredCoordinators", g_simulator.desiredCoordinators).detail("ClusterAvailable", g_simulator.isAvailable());

 			// Setup the coordinators BEFORE the exclusion
 			// Otherwise, we may end up with NotEnoughMachinesForCoordinators
@ -349,38 +426,14 @@ struct RemoveServersSafelyWorkload : TestWorkload {
 					break;
 			}

-			// Reboot and delete or kill the servers
-			if( self->killProcesses ) {
-				TraceEvent("RemoveAndKill").detail("Step", removeClear ? "ClearProcesses" : "KillProcesses").detail("Addresses", describe(toKill))
-					.detail("Processes", killProcesses.size()).detail("ClusterAvailable", g_simulator.isAvailable());
-				for (auto& killProcess : killProcesses) {
-					TraceEvent("RemoveAndKill").detail("Step", removeClear ? "Clear Process" : "Kill Process").detail("Process", describe(*killProcess)).detail("ClusterAvailable", g_simulator.isAvailable()).detail("Protected", g_simulator.protectedAddresses.count(killProcess->address));
-//				ASSERT(g_simulator.protectedAddresses.count(killProcess->address) == 0);
-					if (removeClear)
-						g_simulator.rebootProcess( killProcess, ISimulator::RebootProcessAndDelete);
-					else
-						g_simulator.killProcess( killProcess, ISimulator::KillInstantly );
-				}
-			}
-			else {
-				std::set<Optional<Standalone<StringRef>>> zoneIds;
-				bool killedMachine;
-				for (auto& killProcess : killProcesses) {
-					zoneIds.insert(killProcess->locality.zoneId());
-				}
-				TraceEvent("RemoveAndKill").detail("Step", removeClear ? "ClearMachines" : "KillMachines").detail("Addresses", describe(toKill)).detail("Processes", killProcesses.size()).detail("Zones", zoneIds.size()).detail("ClusterAvailable", g_simulator.isAvailable());
-				for (auto& zoneId : zoneIds) {
-					killedMachine = g_simulator.killMachine( zoneId, removeClear ? ISimulator::RebootAndDelete : ISimulator::KillInstantly, removeClear ? true : false );
-					TraceEvent(killedMachine ? SevInfo : SevWarn, "RemoveAndKill").detail("Step", removeClear ? "Clear Machine" : "Kill Machine").detailext("ZoneId", zoneId).detail(removeClear ? "Cleared" : "Killed", killedMachine).detail("ClusterAvailable", g_simulator.isAvailable());
-				}
-			}
+			self->killAddresses(toKill);
 		}
 		else
 		{
-			TraceEvent("RemoveAndKill").detail("Step", "nothing to clear").detail("ClusterAvailable", g_simulator.isAvailable());
+			TraceEvent("RemoveAndKill", functionId).detail("Step", "nothing to clear").detail("ClusterAvailable", g_simulator.isAvailable());
 		}

-		TraceEvent("RemoveAndKill").detail("Step", "done").detail("ClusterAvailable", g_simulator.isAvailable());
+		TraceEvent("RemoveAndKill", functionId).detail("Step", "done").detail("ClusterAvailable", g_simulator.isAvailable());

 		return Void();
 	}
--- a/fdbserver/workloads/Rollback.actor.cpp
+++ b/fdbserver/workloads/Rollback.actor.cpp
@ -58,29 +58,32 @@ struct RollbackWorkload : TestWorkload {
 	ACTOR Future<Void> simulateFailure( Database cx, RollbackWorkload* self ) {
 		auto system = self->dbInfo->get();
 		auto tlogs = system.logSystemConfig.allPresentLogs();
-
-		if( tlogs.empty() ) {
+		
+		if( tlogs.empty() || system.client.proxies.empty() ) {
 			TraceEvent(SevInfo, "UnableToTriggerRollback").detail("Reason", "No tlogs in System Map");
 			return Void();
 		}
+
+		state MasterProxyInterface proxy = g_random->randomChoice( system.client.proxies );
+
 		int utIndex = g_random->randomInt(0, tlogs.size());
 		state NetworkAddress uncloggedTLog = tlogs[utIndex].address();

 		for(int t=0; t<tlogs.size(); t++)
 			if (t != utIndex)
-				if( tlogs[ t ].address().ip == system.master.address().ip ) {
-					TraceEvent(SevInfo, "UnableToTriggerRollback").detail("Reason", "master-clogged tLog shared IPs");
+				if( tlogs[ t ].address().ip == proxy.address().ip ) {
+					TraceEvent(SevInfo, "UnableToTriggerRollback").detail("Reason", "proxy-clogged tLog shared IPs");
 					return Void();
 				}

 		TraceEvent("AttemptingToTriggerRollback")
-			.detail("Master", system.master.address())
+			.detail("Proxy", proxy.address())
 			.detail("UncloggedTLog", uncloggedTLog);

 		for(int t=0; t<tlogs.size(); t++)
 			if (t != utIndex)
 				g_simulator.clogPair( 
-					system.master.address().ip,
+					proxy.address().ip,
 					tlogs[t].address().ip,
 					self->clogDuration );
 				//g_simulator.clogInterface( g_simulator.getProcess( system.tlogs[t].commit.getEndpoint() ), self->clogDuration, ClogAll );
@ -89,12 +92,12 @@ struct RollbackWorkload : TestWorkload {
 		Void _ = wait( delay( self->clogDuration/3 ) );
 		auto system = self->dbInfo->get();

-		// Kill the master and the unclogged tlog
+		// Kill the proxy and the unclogged tlog
 		if (self->enableFailures) {
-			g_simulator.killProcess( g_simulator.getProcessByAddress( system.master.address() ), ISimulator::KillInstantly );
+			g_simulator.killProcess( g_simulator.getProcessByAddress( proxy.address() ), ISimulator::KillInstantly );
 			g_simulator.clogInterface( uncloggedTLog.ip, self->clogDuration, ClogAll );
 		} else {
-			g_simulator.clogInterface( system.master.address().ip, self->clogDuration, ClogAll );
+			g_simulator.clogInterface( proxy.address().ip, self->clogDuration, ClogAll );
 			g_simulator.clogInterface( uncloggedTLog.ip, self->clogDuration, ClogAll );
 		}
 		return Void();
--- a/fdbserver/workloads/Throughput.actor.cpp
+++ b/fdbserver/workloads/Throughput.actor.cpp
@ -372,7 +372,7 @@ struct ThroughputWorkload : TestWorkload {
 		double ierror = (self->totalLatencyIntegral - self->totalTransactionsIntegral * self->targetLatency) / 
 			self->totalTransactionsIntegral * (after-self->startT);

-		double desiredSuccessors = 1 - error*self->Pgain - ierror*self->Igain;
+		double desiredSuccessors = 1 - (error*self->Pgain + ierror*self->Igain) / self->targetLatency;

 		//if (g_random->random01() < .001) TraceEvent("ThroughputControl").detail("Error", error).detail("IError", ierror).detail("DesiredSuccessors", desiredSuccessors).detail("ActiveActors", self->activeActors);

--- a/fdbserver/workloads/workloads.h
+++ b/fdbserver/workloads/workloads.h
@ -160,7 +160,7 @@ public:
 		waitForQuiescenceBegin = true;
 		waitForQuiescenceEnd = true;
 		simCheckRelocationDuration = false;
-		simEnableConnectionFailures = true;
+		simConnectionFailuresDisableDuration = 0;
 		simBackupAgents = ISimulator::NoBackupAgents;
 	}
 	TestSpec( StringRef title, bool dump, bool clear, double startDelay = 30.0, bool useDB = true, double databasePingDelay = -1.0 ) : 
@ -169,7 +169,7 @@ public:
 				useDB( useDB ), timeout( 600 ),
 				databasePingDelay( databasePingDelay ), runConsistencyCheck( g_network->isSimulated() ),
 				waitForQuiescenceBegin( true ), waitForQuiescenceEnd( true ), simCheckRelocationDuration( false ), 
-				simEnableConnectionFailures( true ), simBackupAgents( ISimulator::NoBackupAgents ) {
+				simConnectionFailuresDisableDuration( 0 ), simBackupAgents( ISimulator::NoBackupAgents ) {
 		phases = TestWorkload::SETUP | TestWorkload::EXECUTION | TestWorkload::CHECK | TestWorkload::METRICS;
 		if( databasePingDelay < 0 )
 			databasePingDelay = g_network->isSimulated() ? 0.0 : 15.0;
@ -189,7 +189,7 @@ public:
 	bool waitForQuiescenceEnd;

 	bool simCheckRelocationDuration; //If set to true, then long duration relocations generate SevWarnAlways messages.  Once any workload sets this to true, it will be true for the duration of the program.  Can only be used in simulation.
-	bool simEnableConnectionFailures; //If set to true, then network connections are subjected to random failures.  Once any workload sets this to false, it will be false for the duration of the program. Can only be used in simulation.
+	double simConnectionFailuresDisableDuration;
 	ISimulator::BackupAgentType simBackupAgents; //If set to true, then the simulation runs backup agents on the workers. Can only be used in simulation.
 };

--- a/flow/Knobs.cpp
+++ b/flow/Knobs.cpp
@ -52,7 +52,7 @@ FlowKnobs::FlowKnobs(bool randomize, bool isSimulated) {

 	//FlowTransport
 	init( CONNECTION_REJECTED_MESSAGE_DELAY,                   1.0 );
-	init( CONNECTION_ID_TIMEOUT,                             600.0 );
+	init( CONNECTION_ID_TIMEOUT,                             600.0 ); if( randomize && BUGGIFY ) CONNECTION_ID_TIMEOUT = 60.0;
 	init( CONNECTION_CLEANUP_DELAY,                          100.0 );
 	init( INITIAL_RECONNECTION_TIME,                          0.05 );
 	init( MAX_RECONNECTION_TIME,                               0.5 );
--- a/flow/Platform.cpp
+++ b/flow/Platform.cpp
@ -484,11 +484,6 @@ const char* getInterfaceName(uint32_t _ip) {
 void getNetworkTraffic(uint32_t ip, uint64_t& bytesSent, uint64_t& bytesReceived,
 					   uint64_t& outSegs, uint64_t& retransSegs) {
 	INJECT_FAULT( platform_error, "getNetworkTraffic" ); // Even though this function doesn't throw errors, the equivalents for other platforms do, and since all of our simulation testing is on Linux...
-	bytesSent = 0;
-	bytesReceived = 0;
-	outSegs = 0;
-	retransSegs = 0;
-
 	const char* ifa_name = getInterfaceName(ip);
 	if (!ifa_name)
 		return;
@ -500,8 +495,8 @@ void getNetworkTraffic(uint32_t ip, uint64_t& bytesSent, uint64_t& bytesReceived
 	std::string iface;
 	std::string ignore;

-	bytesSent = 0;
-	bytesReceived = 0;
+	uint64_t bytesSentSum = 0;
+	uint64_t bytesReceivedSum = 0;

 	while (dev_stream.good()) {
 		dev_stream >> iface;
@ -513,13 +508,20 @@ void getNetworkTraffic(uint32_t ip, uint64_t& bytesSent, uint64_t& bytesReceived
 			for (int i = 0; i < 7; i++) dev_stream >> ignore;
 			dev_stream >> sent;

-			bytesSent += sent;
-			bytesReceived += received;
+			bytesSentSum += sent;
+			bytesReceivedSum += received;

 			dev_stream.ignore(std::numeric_limits<std::streamsize>::max(), '\n');
 		}
 	}

+	if(bytesSentSum > 0) {
+		bytesSent = bytesSentSum;
+	}
+	if(bytesReceivedSum > 0) {
+		bytesReceived = bytesReceivedSum;
+	}
+
 	std::ifstream snmp_stream("/proc/net/snmp", std::ifstream::in);

 	std::string label;
@ -558,6 +560,8 @@ void getMachineLoad(uint64_t& idleTime, uint64_t& totalTime) {

 void getDiskStatistics(std::string const& directory, uint64_t& currentIOs, uint64_t& busyTicks, uint64_t& reads, uint64_t& writes, uint64_t& writeSectors, uint64_t& readSectors) {
 	INJECT_FAULT( platform_error, "getDiskStatistics" );
+	currentIOs = 0;
+
 	struct stat buf;
 	if (stat(directory.c_str(), &buf)) {
 		TraceEvent(SevError, "GetDiskStatisticsStatError").detail("Directory", directory).GetLastError();
@ -653,12 +657,6 @@ void getDiskStatistics(std::string const& directory, uint64_t& currentIOs, uint6
 	}

 	if(!g_network->isSimulated()) TraceEvent(SevWarnAlways, "DeviceNotFound").detail("Directory", directory);
-	currentIOs = 0;
-	busyTicks = 0;
-	reads = 0;
-	writes = 0;
-	writeSectors = 0;
-	readSectors = 0;
 }

 dev_t getDeviceId(std::string path) {
@ -685,10 +683,6 @@ dev_t getDeviceId(std::string path) {
 void getNetworkTraffic(uint32_t ip, uint64_t& bytesSent, uint64_t& bytesReceived,
 					   uint64_t& outSegs, uint64_t& retransSegs) {
 	INJECT_FAULT( platform_error, "getNetworkTraffic" );
-	bytesSent = 0;
-	bytesReceived = 0;
-	outSegs = 0;
-	retransSegs = 0;

 	const char* ifa_name = getInterfaceName(ip);
 	if (!ifa_name)
@ -732,6 +726,7 @@ void getNetworkTraffic(uint32_t ip, uint64_t& bytesSent, uint64_t& bytesReceived
 				bytesSent = if2m->ifm_data.ifi_obytes;
 				bytesReceived = if2m->ifm_data.ifi_ibytes;
 				outSegs = if2m->ifm_data.ifi_opackets;
+				retransSegs = 0;
 				break;
 			}
 		}
@ -758,8 +753,6 @@ void getDiskStatistics(std::string const& directory, uint64_t& currentIOs, uint6
 	INJECT_FAULT( platform_error, "getDiskStatistics" );
 	currentIOs = 0;
 	busyTicks = 0;
-	reads = 0;
-	writes = 0;
 	writeSectors = 0;
 	readSectors = 0;

@ -1115,10 +1108,10 @@ SystemStatistics getSystemStatistics(std::string dataFolder, uint32_t ip, System
 			returnStats.machineCPUSeconds = (100 - DisplayValue.doubleValue) * returnStats.elapsed / 100.0;
 	}
 #elif defined(__unixish__)
-	uint64_t machineNowSent, machineNowReceived;
-	uint64_t machineOutSegs, machineRetransSegs;
-	uint64_t currentIOs, nowBusyTicks, nowReads, nowWrites, nowWriteSectors, nowReadSectors;
-	uint64_t clockIdleTime, clockTotalTime;
+	uint64_t machineNowSent = (*statState)->machineLastSent;
+	uint64_t machineNowReceived = (*statState)->machineLastReceived;
+	uint64_t machineOutSegs = (*statState)->machineLastOutSegs;
+	uint64_t machineRetransSegs = (*statState)->machineLastRetransSegs;

 	getNetworkTraffic(ip, machineNowSent, machineNowReceived, machineOutSegs, machineRetransSegs);
 	if( returnStats.initialized ) {
@ -1132,6 +1125,13 @@ SystemStatistics getSystemStatistics(std::string dataFolder, uint32_t ip, System
 	(*statState)->machineLastOutSegs = machineOutSegs;
 	(*statState)->machineLastRetransSegs = machineRetransSegs;

+	uint64_t currentIOs;
+	uint64_t nowBusyTicks = (*statState)->lastBusyTicks;
+	uint64_t nowReads = (*statState)->lastReads;
+	uint64_t nowWrites = (*statState)->lastWrites;
+	uint64_t nowWriteSectors = (*statState)->lastWriteSectors; 
+	uint64_t nowReadSectors = (*statState)->lastReadSectors;
+
 	if(dataFolder != "") {
 		getDiskStatistics(dataFolder, currentIOs, nowBusyTicks, nowReads, nowWrites, nowWriteSectors, nowReadSectors);
 		returnStats.processDiskQueueDepth = currentIOs;
@ -1151,6 +1151,9 @@ SystemStatistics getSystemStatistics(std::string dataFolder, uint32_t ip, System
 		(*statState)->lastReadSectors = nowReadSectors;
 	}

+	uint64_t clockIdleTime = (*statState)->lastClockIdleTime;
+	uint64_t clockTotalTime = (*statState)->lastClockTotalTime;
+
 	getMachineLoad(clockIdleTime, clockTotalTime);
 	returnStats.machineCPUSeconds = clockTotalTime - (*statState)->lastClockTotalTime != 0 ? ( 1 - ((clockIdleTime - (*statState)->lastClockIdleTime) / ((double)(clockTotalTime - (*statState)->lastClockTotalTime)))) * returnStats.elapsed : 0;
 	(*statState)->lastClockIdleTime = clockIdleTime;
@ -1539,12 +1542,13 @@ void renameFile( std::string const& fromPath, std::string const& toPath ) {
 	INJECT_FAULT( io_error, "renameFile" );
 #ifdef _WIN32
 	if (MoveFile( fromPath.c_str(), toPath.c_str() )) {
-		renamedFile();
+		//renamedFile();
 		return;
 	}
 #elif (defined(__linux__) || defined(__APPLE__))
 	if (!rename( fromPath.c_str(), toPath.c_str() )) {
-		renamedFile();
+		//FIXME: We cannot inject faults after renaming the file, because we could end up with two asyncFileNonDurable open for the same file
+		//renamedFile();
 		return;
 	}
 #else
--- a/flow/SignalSafeUnwind.cpp
+++ b/flow/SignalSafeUnwind.cpp
@ -76,7 +76,11 @@ extern "C" int dl_iterate_phdr(
    } else {
        // This path is NOT async signal safe, and serves until and unless initSignalSafeUnwind() is called
        initChain();
-        return chain_dl_iterate_phdr(callback, data);
+
+		setProfilingEnabled(0);
+        int result = chain_dl_iterate_phdr(callback, data);
+		setProfilingEnabled(1);
+		return result;
    }
 }

--- a/flow/TDMetric.actor.h
+++ b/flow/TDMetric.actor.h
@ -680,7 +680,7 @@ struct TimeDescriptor {
 };

 struct BaseMetric {
-	BaseMetric(MetricNameRef const &name) : metricName(name), pCollection(nullptr), registered(false) {
+	BaseMetric(MetricNameRef const &name) : metricName(name), pCollection(nullptr), registered(false), enabled(false) {
 		setConfig(false);
 	}
 	virtual ~BaseMetric() {
--- a/flow/genericactors.actor.h
+++ b/flow/genericactors.actor.h
@ -239,8 +239,12 @@ Future<Void> triggerOnError( Func what, Future<Void> signal ) {
 ACTOR template<class T>
 void uncancellable(Future<T> what, Promise<T> result)
 {
-	T val = wait(what);
-	result.send(val);
+	try {
+		T val = wait(what);
+		result.send(val);
+	} catch( Error &e ) {
+		result.sendError(e);
+	}
 }

 //Waits for a future to complete and cannot be cancelled
--- a/layers/directory/directory.py
+++ b/layers/directory/directory.py
@ -1,260 +0,0 @@
-#
-# directory.py
-#
-# This source file is part of the FoundationDB open source project
-#
-# Copyright 2013-2018 Apple Inc. and the FoundationDB project authors
-# 
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-# 
-#     http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-from subspace import Subspace
-import fdb, fdb.tuple
-import random, struct
-
-fdb.api_version(100)
-
-#TODO: Error class
-
-class HighContentionAllocator (object):
-    def __init__(self, subspace):
-        self.counters = subspace[0]
-        self.recent = subspace[1]
-
-    @fdb.transactional
-    def allocate( self, tr ):
-        """Returns a byte string which
-            (1) has never and will never be returned by another call to HighContentionAllocator.allocate() on the same subspace
-            (2) is nearly as short as possible given the above"""
-
-        [(start, count)] = [ (self.counters.unpack(k)[0],struct.unpack("<q",v)[0]) for k,v in tr.snapshot.get_range( self.counters.range().start, self.counters.range().stop, limit=1, reverse=True ) ] or [ (0,0) ]
-
-        window = self._window_size(start)
-        if (count+1)*2 >= window:
-            # Advance the window
-            del tr[ self.counters : self.counters[start].key()+chr(0) ]
-            start += window
-            del tr[ self.recent : self.recent[start] ]
-            window = self._window_size(start)
-
-        # Increment the allocation count for the current window
-        tr.add( self.counters[start], struct.pack("<q", 1) )
-
-        while True:
-            # As of the snapshot we are reading from, the window is less than half full, so
-            # this should be expected 2 tries.  Under high contention (and when the window advances),
-            # there is an additional subsequent risk of conflict for this transaction.
-            candidate = random.randint( start, start+window )
-            if tr[ self.recent[candidate] ] == None:
-                tr[ self.recent[candidate] ] = ""
-                return fdb.tuple.pack( (candidate,) )
-
-    def _window_size(self, start):
-        # Larger window sizes are better for high contention, smaller for keeping the keys small.  But if
-        # there are lots of allocations the keys can't be too small.  So start small and scale up.  We don't
-        # want this to ever get *too* big because we have to store about window_size/2 recent items.
-        if start < 255: return 64
-        if start < 65535: return 1024
-        return 8192
-
-class DirectoryLayer (object):
-    def __init__(self, node_subspace = Subspace( rawPrefix="\xfe" ), content_subspace = Subspace() ):
-        self.content_subspace = content_subspace
-        self.node_subspace = node_subspace
-        # The root node is the one whose contents are the node subspace
-        self.root_node = self.node_subspace[ self.node_subspace.key() ]
-        self.allocator = HighContentionAllocator( self.root_node['hca'] )
-
-    @fdb.transactional
-    def create_or_open( self, tr, path, layer=None, prefix=None, allow_create=True, allow_open=True ):
-        """Opens the directory with the given path.
-        If the directory does not exist, it is created (creating parent directories if necessary).
-        If prefix is specified, the directory is created with the given physical prefix; otherwise a prefix is allocated automatically.
-        If layer is specified, it is checked against the layer of an existing directory or set as the layer of a new directory."""
-        if isinstance(path, str): path=(path,)
-        if not path: raise ValueError( "The root directory may not be opened." )  # Because it contains node metadata!
-        existing_node = self._find(tr, path)
-        if existing_node:
-            if not allow_open: raise ValueError("The directory already exists.")
-            existing_layer = tr[ existing_node['layer'].key() ]
-            if layer and existing_layer and existing_layer != layer:
-                raise ValueError( "The directory exists but was created with an incompatible layer." )
-            return self._contents_of_node(existing_node, path, existing_layer)
-        if not allow_create: raise ValueError("The directory does not exist.")
-
-        if prefix==None:
-            prefix = self.allocator.allocate(tr)
-
-        if not self._is_prefix_free(tr, prefix):
-            raise ValueError("The given prefix is already in use.")
-
-        if path[:-1]:
-            parent_node = self._node_with_prefix( self.create_or_open(tr, path[:-1], layer=None).key() )
-        else:
-            parent_node = self.root_node
-        #parent_node = self._find(tr, path[:-1])
-        if not parent_node:
-            print repr(path[:-1])
-            raise ValueError("The parent directory doesn't exist.")
-
-        node = self._node_with_prefix(prefix)
-        tr[ parent_node[self.SUBDIRS][ path[-1] ].key() ] = prefix
-        if layer: tr[ node['layer'].key() ] = layer
-
-        return self._contents_of_node(node, path, layer)
-
-    def open( self, db_or_tr, path, layer=None ):
-        """Opens the directory with the given path.
-        If the directory does not exist, an error is raised.
-        If layer is specified, and a different layer was specified when the directory was created, an error is raised."""
-        return self.create_or_open(db_or_tr, path, layer, allow_create=False)
-    def create( self, db_or_tr, path, layer=None, prefix=None ):
-        """Creates a directory with the given path (creating parent directories if necessary).
-        If the given directory already exists, an error is raised.
-        If prefix is specified, the directory is created with the given physical prefix; otherwise a prefix is allocated automatically.
-        If layer is specified, it is recorded with the directory and will be checked by future calls to open."""
-        return self.create_or_open(db_or_tr, path, layer, prefix, allow_open=False)
-
-    @fdb.transactional
-    def move( self, tr, old_path, new_path ):
-        """Moves the directory found at `old_path` to `new_path`.
-        There is no effect on the physical prefix of the given directory, or on clients that already have the directory open.
-        If the old directory does not exist, a directory already exists at `new_path`, or the parent directory of `new_path`
-        does not exist, an error is raised."""
-        if isinstance(old_path, str): old_path=(old_path,)
-        if isinstance(new_path, str): new_path=(new_path,)
-        if self._find(tr, new_path): raise ValueError( "The destination directory already exists.  Remove it first." )
-        old_node = self._find(tr, old_path)
-        if not old_node: raise ValueError("The source directory does not exist.")
-        parent_node = self._find(tr, new_path[:-1] )
-        if not parent_node: raise ValueError( "The parent of the destination directory does not exist.  Create it first." )
-        tr[ parent_node[self.SUBDIRS][ new_path[-1] ].key() ] = self._contents_of_node( old_node, None ).key()
-        self._remove_from_parent( tr, old_path )
-        return self._contents_of_node( old_node, new_path, tr[ old_node['layer'].key() ] )
-
-    @fdb.transactional
-    def remove( self, tr, path ):
-        """Removes the directory, its contents and all subdirectories transactionally.
-        Warning: Clients which have already opened the directory might still insert data into its contents after it is removed."""
-        if isinstance(path, str): path=(path,)
-        n = self._find(tr, path)
-        if not n: raise ValueError( "The directory doesn't exist." )
-        self._remove_recursive(tr, n)
-        self._remove_from_parent(tr, path)
-
-    @fdb.transactional
-    def list( self, tr, path=() ):
-        if isinstance(path, str): path=(path,)
-        node = self._find( tr, path)
-        if not node:
-            raise ValueError("The given directory does not exist.")
-        return [name for name, cnode in self._subdir_names_and_nodes(tr, node)]
-
-    ### IMPLEMENTATION ###
-    SUBDIRS=0
-
-    def _node_containing_key(self, tr, key):
-        # Right now this is only used for _is_prefix_free(), but if we add parent pointers to directory nodes,
-        # it could also be used to find a path based on a key
-        if key.startswith(self.node_subspace.key()):
-            return self.root_node
-        for k,v in tr.get_range( self.node_subspace.range( () ).start,
-                                 self.node_subspace.pack( (key,) )+"\x00",
-                                 reverse=True,
-                                 limit=1 ):
-            prev_prefix = self.node_subspace.unpack( k )[0]
-            if key.startswith(prev_prefix):
-                return Subspace( rawPrefix=k ) # self.node_subspace[prev_prefix]
-        return None
-
-    def _node_with_prefix( self, prefix ):
-        if prefix==None: return None
-        return self.node_subspace[prefix]
-
-    def _contents_of_node( self, node, path, layer=None ):
-        prefix = self.node_subspace.unpack( node.key() )[0]
-        return DirectorySubspace( path, prefix, self, layer )
-
-    def _find( self, tr, path ):
-        n = self.root_node
-        for name in path:
-            n = self._node_with_prefix( tr[ n[self.SUBDIRS][name].key() ] )
-            if n == None:
-                return None
-        return n
-
-    def _subdir_names_and_nodes( self, tr, node ):
-        sd = node[self.SUBDIRS]
-        for k,v in tr[sd.range(())]:
-            yield sd.unpack(k)[0], self._node_with_prefix( v )
-
-    def _remove_from_parent( self, tr, path ):
-        parent = self._find( tr, path[:-1] )
-        del tr[ parent[self.SUBDIRS][ path[-1] ].key() ]
-
-    def _remove_recursive( self, tr, node):
-        for name, sn in self._subdir_names_and_nodes(tr, node):
-            self._remove_recursive(tr, sn)
-        tr.clear_range_startswith( self._contents_of_node(node,None).key() )
-        del tr[ node.range(()) ]
-
-    def _is_prefix_free( self, tr, prefix ):
-        # Returns true if the given prefix does not intersect any currently allocated prefix
-        # (including the root node).  This means that it neither contains any other prefix nor
-        # is contained by any other prefix.
-        return prefix and not self._node_containing_key( tr, prefix ) and not len(list(tr.get_range( self.node_subspace.pack( (prefix,) ), self.node_subspace.pack( (strinc(prefix),) ), limit=1 )))
-
-directory = DirectoryLayer()
-
-class DirectorySubspace (Subspace):
-    # A DirectorySubspace represents the *contents* of a directory, but it also remembers
-    # the path it was opened with and offers convenience methods to operate on the directory
-    # at that path.
-    def __init__(self, path, prefix, directoryLayer=directory, layer=None):
-        Subspace.__init__(self, rawPrefix=prefix)
-        self.path = path
-        self.directoryLayer = directoryLayer
-        self.layer = layer
-
-    def __repr__(self):
-        return 'DirectorySubspace(' + repr(self.path) + ',' + repr(self.rawPrefix) + ')'
-
-    def check_layer(self, layer):
-        if layer and self.layer and layer!=self.layer:
-            raise ValueError("The directory was created with an incompatible layer.")
-
-    def create_or_open( self, db_or_tr, name_or_path, layer=None, prefix=None ):
-        if not isinstance( name_or_path, tuple ): name_or_path = (name_or_path,)
-        return self.directoryLayer.create_or_open( db_or_tr, self.path + name_or_path, layer, prefix )
-    def open( self, db_or_tr, name_or_path, layer=None ):
-        if not isinstance( name_or_path, tuple ): name_or_path = (name_or_path,)
-        return self.directoryLayer.open( db_or_tr, self.path + name_or_path, layer )
-    def create( self, db_or_tr, name_or_path, layer=None ):
-        if not isinstance( name_or_path, tuple ): name_or_path = (name_or_path,)
-        return self.directoryLayer.create( db_or_tr, self.path + name_or_path, layer )
-    def move_to( self, db_or_tr, new_path ):
-        return self.directoryLayer.moveTo( db_or_tr, self.path, new_path )
-    def remove( self, db_or_tr ):
-        return self.directoryLayer.remove( db_or_tr, self.path )
-    def list( self, db_or_tr ):
-        return self.directoryLayer.list( db_or_tr, self.path )
-
-def random_key():
-    return uuid.uuid4().bytes
-
-def strinc(key):
-    lastc = (ord(key[-1:]) + 1) % 256
-    if lastc:
-        return key[:-1] + chr(lastc)
-    else:
-        return strinc(key[:-1]) + chr(lastc)
--- a/layers/directory/dirtest2.py
+++ b/layers/directory/dirtest2.py
@ -1,79 +0,0 @@
-#
-# dirtest2.py
-#
-# This source file is part of the FoundationDB open source project
-#
-# Copyright 2013-2018 Apple Inc. and the FoundationDB project authors
-# 
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-# 
-#     http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-import fdb, fdb.tuple
-fdb.api_version(23)
-
-from subspace import Subspace
-from directory import directory, DirectoryLayer
-
-def is_error(f):
-    try:
-        f()
-        return False
-    except:
-        return True
-
-db = fdb.open()
-del db[:]
-
-print directory.create( db, 'evil', prefix="\x14" )
-directory = DirectoryLayer( content_subspace = Subspace(rawPrefix="\x01") )
-
-# Make a new directory
-stuff = directory.create( db, ('stuff',) )
-print 'stuff is in', stuff
-print 'stuff[0] is', fdb.tuple.unpack( stuff[0].key() )
-#assert stuff.key() == "\x01\x14"
-
-# Open it again
-stuff2 = directory.open( db, ('stuff',) )
-assert stuff2.key() == stuff.key()
-
-# Make another directory
-items = directory.create_or_open( db, ('items',) )
-print 'items are in', items
-#assert items.key() == "\x01\x15\x01"
-
-# List the root directory
-assert directory.list(db, ()) == ['evil','items','stuff']
-
-# Move everything into an 'app' directory
-app = directory.create( db, ('app',) )
-directory.move( db, ('stuff',), ('app','stuff') )
-directory.move( db, ('items',), ('app','items') )
-
-# Make a directory in a hard-coded place
-special = directory.create_or_open( db, ('app', 'special'), prefix="\x00" )
-assert special.key() == "\x00"
-
-assert directory.list(db, ()) == ['app','evil']
-assert directory.list(db, ("app",)) == ['items', 'special', 'stuff']
-
-assert directory.open( db, ('app', 'stuff') ).key() == stuff.key()
-
-# Destroy the stuff directory
-directory.remove( db, ('app', 'stuff') )
-assert is_error( lambda: directory.open( db, ('app','stuff')) )
-assert directory.list(db, ("app",)) == ['items', 'special']
-
-# Test that items is still OK
-items2 = directory.create_or_open( db, ('app','items') )
-assert items.key() == items.key()
--- a/layers/directory/subspace.py
+++ b/layers/directory/subspace.py
@ -1,58 +0,0 @@
-#
-# subspace.py
-#
-# This source file is part of the FoundationDB open source project
-#
-# Copyright 2013-2018 Apple Inc. and the FoundationDB project authors
-# 
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-# 
-#     http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-###################################
-# This defines a Subspace of keys #
-###################################
-
-import fdb.tuple
-
-class Subspace (object):
-    def __init__(self, prefixTuple=tuple(), rawPrefix=""):
-        self.rawPrefix = rawPrefix + fdb.tuple.pack(prefixTuple)
-
-    def __repr__(self):
-        return 'Subspace(rawPrefix=' + repr(self.rawPrefix) + ')'
-
-    def __getitem__(self, name):
-        return Subspace( (name,), self.rawPrefix )
-   
-    def key(self):
-        return self.rawPrefix
-
-    def pack(self, t = tuple()):
-        return self.rawPrefix + fdb.tuple.pack( t )
-    
-    def unpack(self, key):
-        assert key.startswith(self.rawPrefix)
-        return fdb.tuple.unpack(key[len(self.rawPrefix):])
-    
-    def range(self, t = tuple()):
-        p = fdb.tuple.range( t )
-        return slice(self.rawPrefix + p.start, self.rawPrefix + p.stop)
-
-    def contains(self, key):
-        return key.startswith(self.rawPrefix)
-
-    def as_foundationdb_key(self):
-        return self.rawPrefix
-
-    def subspace(self, tuple):
-        return Subspace( tuple, self.rawPrefix )
--- a/packaging/foundationdb.conf
+++ b/packaging/foundationdb.conf
@ -15,7 +15,7 @@ restart_delay = 60
 # restart_backoff = 60
 # restart_delay_reset_interval = 60
 cluster_file = /etc/foundationdb/fdb.cluster
-# delete_wd40_env = false
+# delete_envvars =
 # kill_on_configuration_change = true

 ## Default parameters for individual fdbserver processes
--- a/packaging/msi/FDBInstaller.wxs
+++ b/packaging/msi/FDBInstaller.wxs
@ -32,7 +32,7 @@

 <Wix xmlns='http://schemas.microsoft.com/wix/2006/wi'>
  <Product Name='$(var.Title)'
-           Id='{E1E1FACE-6556-42A5-8F29-5E16D1418E24}'
+           Id='{06EE6C90-3838-4C25-95D6-A4716F8CE7D0}'
           UpgradeCode='{A95EA002-686E-4164-8356-C715B7F8B1C8}'
           Version='$(var.Version)'
           Manufacturer='$(var.Manufacturer)'
--- a/tests/fast/LowLatency.txt
+++ b/tests/fast/LowLatency.txt
@ -0,0 +1,21 @@
+testTitle=Clogged
+    testName=Cycle
+    transactionsPerSecond=1000.0
+    testDuration=30.0
+    expectedRate=0
+
+    testName=LowLatency
+    testDuration=30.0
+
+    testName=Attrition
+    machinesToKill=1
+    machinesToLeave=3
+    reboot=true
+    testDuration=30.0
+    waitForVersion=true
+    allowFaultInjection=false
+    killDc=false
+
+connectionFailuresDisableDuration=100000
+buggify=off
+minimumReplication=2
--- a/tests/python_tests/python_performance.py
+++ b/tests/python_tests/python_performance.py
@ -275,7 +275,7 @@ class PythonPerformance(PythonTest):

        for i in range(count):
            index = random.randint(0, self.key_count)
-            list(tr[self.key(index):self.key(index+1)])
+            list(tr.get_range(self.key(index), self.key(index+1), limit=2))

        return count / (time.time() - s)

--- a/tests/rare/CheckRelocation.txt
+++ b/tests/rare/CheckRelocation.txt
@ -12,4 +12,4 @@ valueBytes=128
 discardEdgeMeasurements=false
 warmingDelay=10.0
 simCheckRelocationDuration=true
-simEnableConnectionFailures=false
+connectionFailuresDisableDuration=100000
--- a/tests/rare/ConflictRangeCheck.txt
+++ b/tests/rare/ConflictRangeCheck.txt
@ -1,3 +1,3 @@
 testTitle=RandomReadWriteTest
 testName=ConflictRange
-simEnableConnectionFailures=false
+connectionFailuresDisableDuration=100000
--- a/Show More
+++ b/Show More