foundationdb/fdbclient/BlobGranuleReader.actor.cpp

/*
 * BlobGranuleReader.actor.cpp
 *
 * This source file is part of the FoundationDB open source project
 *
 * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <map>
#include <vector>

#include "fdbclient/AsyncFileS3BlobStore.actor.h"
#include "fdbclient/Atomic.h"
#include "fdbclient/BlobGranuleCommon.h"
#include "fdbclient/BlobGranuleReader.actor.h"
#include "fdbclient/BlobWorkerCommon.h"
#include "fdbclient/BlobWorkerInterface.h"
#include "fdbclient/SystemData.h" // for allKeys unit test - could remove
#include "flow/UnitTest.h"
#include "flow/actorcompiler.h" // This must be the last #include.

// TODO more efficient data structure besides std::map? PTree is unnecessary since this isn't versioned, but some other
// sorted thing could work. And if it used arenas it'd probably be more efficient with allocations, since everything
// else is in 1 arena and discarded at the end.

// TODO could refactor the file reading code from here and the delta file function into another actor,
// then this part would also be testable? but meh

#define BG_READ_DEBUG false

// Reads the byte range described by `f` from the blob store, deserializes it as a GranuleSnapshot, and inserts every
// row whose key lies in [keyRange.begin, keyRange.end) into *dataMap.
//
//   bstore   - container used to open the file named by f.filename
//   f        - file name plus the offset and length of the bytes to read
//   keyRange - rows with keys outside this half-open range are skipped
//   dataMap  - output map; entries are inserted by reference (keys and values are not copied)
//
// Returns the arena that holds the raw file bytes; it also depends on the arena used during parsing, so the entries
// added to *dataMap are only usable while the returned arena (or something depending on it) is alive.
// Any Error is printed to stdout and rethrown.
ACTOR Future<Arena> readSnapshotFile(Reference<BackupContainerFileSystem> bstore,
                                     BlobFilePointerRef f,
                                     KeyRangeRef keyRange,
                                     std::map<KeyRef, ValueRef>* dataMap) {
	try {
		state Arena arena;
		state Reference<IAsyncFile> reader = wait(bstore->readFile(f.filename.toString()));
		state uint8_t* data = new (arena) uint8_t[f.length];
		int readSize = wait(reader->read(data, f.length, f.offset));
		// A short read is treated as a hard failure.
		ASSERT(f.length == readSize);

		// The reader is given a separate arena for whatever it allocates while parsing; making the returned arena
		// depend on it keeps the parsed rows alive together with the raw bytes.
		Arena parseArena;
		GranuleSnapshot snapshot;
		StringRef dataRef(data, f.length);
		ArenaObjectReader rdr(arena, dataRef, Unversioned());
		rdr.deserialize(FileIdentifierFor<GranuleSnapshot>::value, snapshot, parseArena);
		arena.dependsOn(parseArena);

		// TODO REMOVE sanity check eventually
		// Keys must be strictly increasing; the pruning below relies on that ordering.
		for (int i = 0; i < snapshot.size() - 1; i++) {
			if (snapshot[i].key >= snapshot[i + 1].key) {
				printf("BG SORT ORDER VIOLATION IN SNAPSHOT FILE: '%s', '%s'\n",
				       snapshot[i].key.printable().c_str(),
				       snapshot[i + 1].key.printable().c_str());
			}
			ASSERT(snapshot[i].key < snapshot[i + 1].key);
		}

		// Skip the rows that sort before the requested range...
		int i = 0;
		while (i < snapshot.size() && snapshot[i].key < keyRange.begin) {
			i++;
		}
		// ...then keep rows until the first key at or past the end of the range.
		while (i < snapshot.size() && snapshot[i].key < keyRange.end) {
			dataMap->insert({ snapshot[i].key, snapshot[i].value });
			i++;
		}
		if (BG_READ_DEBUG) {
			// std::map::size() returns size_t, so convert explicitly to match the %d conversion.
			printf("Started with %d rows from snapshot file %s after pruning to [%s - %s)\n",
			       static_cast<int>(dataMap->size()),
			       f.toString().c_str(),
			       keyRange.begin.printable().c_str(),
			       keyRange.end.printable().c_str());
		}

		return arena;
	} catch (Error& e) {
		printf("Reading snapshot file %s got error %s\n", f.toString().c_str(), e.name());
		throw e;
	}
}

// Reads the byte range described by `f` from the blob store and deserializes it as a GranuleDeltas list.
//
//   bstore      - container used to open the file named by f.filename
//   f           - file name plus the offset and length of the bytes to read
//   keyRange    - not used here; key filtering is left to the caller (see the note in the body)
//   readVersion - not used here; version filtering is left to the caller
//
// Returns the parsed deltas; the result's arena holds the raw file bytes and depends on the arena used during
// parsing. Any Error is printed to stdout and rethrown.
ACTOR Future<Standalone<GranuleDeltas>> readDeltaFile(Reference<BackupContainerFileSystem> bstore,
                                                      BlobFilePointerRef f,
                                                      KeyRangeRef keyRange,
                                                      Version readVersion) {
	try {
		state Standalone<GranuleDeltas> result;
		state Reference<IAsyncFile> reader = wait(bstore->readFile(f.filename.toString()));
		state uint8_t* data = new (result.arena()) uint8_t[f.length];
		int readSize = wait(reader->read(data, f.length, f.offset));
		// A short read is treated as a hard failure.
		ASSERT(f.length == readSize);

		// Don't do range or version filtering in here since we'd have to copy/rewrite the deltas and it might starve
		// snapshot read task, do it in main thread

		// The reader is given a separate arena for whatever it allocates while parsing; the result's arena is made to
		// depend on it so the parsed deltas stay valid for as long as the result does.
		Arena parseArena;
		StringRef dataRef(data, f.length);
		ArenaObjectReader rdr(result.arena(), dataRef, Unversioned());
		rdr.deserialize(FileIdentifierFor<GranuleDeltas>::value, result.contents(), parseArena);
		result.arena().dependsOn(parseArena);

		if (BG_READ_DEBUG) {
			printf("Parsed %d deltas from delta file %s\n", result.size(), f.toString().c_str());
		}

		// TODO REMOVE sanity check
		// Versions must be non-decreasing within a file.
		for (int i = 0; i < result.size() - 1; i++) {
			if (result[i].version > result[i + 1].version) {
				// %lld requires a long long argument, so convert the versions explicitly.
				printf("BG VERSION ORDER VIOLATION IN DELTA FILE: '%lld', '%lld'\n",
				       static_cast<long long>(result[i].version),
				       static_cast<long long>(result[i + 1].version));
			}
			ASSERT(result[i].version <= result[i + 1].version);
		}

		return result;
	} catch (Error& e) {
		printf("Reading delta file %s got error %s\n", f.toString().c_str(), e.name());
		throw e;
	}
}

// TODO this giant switch is mostly lifted from storage server.
// Could refactor atomics to have a generic "handle this atomic mutation" thing instead of having to duplicate code with
// the switch statement everywhere?
static void applyDelta(std::map<KeyRef, ValueRef>* dataMap, Arena& ar, KeyRangeRef keyRange, MutationRef m) {
	if (m.type == MutationRef::ClearRange) {
		if (m.param2 <= keyRange.begin || m.param1 >= keyRange.end) {
			return;
		}
		// keyRange is inclusive on start, lower_bound is inclusive with the argument, and erase is inclusive for the
		// begin. So if lower bound didn't find the exact key, we need to go up one so it doesn't erase an extra key
		// outside the range.
		std::map<KeyRef, ValueRef>::iterator itStart = dataMap->lower_bound(m.param1);
		if (itStart != dataMap->end() && itStart->first < m.param1) {
			itStart++;
		}

		// keyRange is exclusive on end, lower bound is inclusive with the argument, and erase is exclusive for the end
		// key. So if lower bound didn't find the exact key, we need to go up one so it doesn't skip the last key it
		// should erase
		std::map<KeyRef, ValueRef>::iterator itEnd = dataMap->lower_bound(m.param2);
		if (itEnd != dataMap->end() && itEnd->first < m.param2) {
			itEnd++;
		}
		dataMap->erase(itStart, itEnd);
	} else {
		if (m.param1 < keyRange.begin || m.param1 >= keyRange.end) {
			return;
		}
		// TODO: we don't need atomics here since eager reads handles it
		std::map<KeyRef, ValueRef>::iterator it = dataMap->find(m.param1);
		if (m.type != MutationRef::SetValue) {
			Optional<StringRef> oldVal;
			if (it != dataMap->end()) {
				oldVal = it->second;
			}

			switch (m.type) {
			case MutationRef::AddValue:
				m.param2 = doLittleEndianAdd(oldVal, m.param2, ar);
				break;
			case MutationRef::And:
				m.param2 = doAnd(oldVal, m.param2, ar);
				break;
			case MutationRef::Or:
				m.param2 = doOr(oldVal, m.param2, ar);
				break;
			case MutationRef::Xor:
				m.param2 = doXor(oldVal, m.param2, ar);
				break;
			case MutationRef::AppendIfFits:
				m.param2 = doAppendIfFits(oldVal, m.param2, ar);
				break;
			case MutationRef::Max:
				m.param2 = doMax(oldVal, m.param2, ar);
				break;
			case MutationRef::Min:
				m.param2 = doMin(oldVal, m.param2, ar);
				break;
			case MutationRef::ByteMin:
				m.param2 = doByteMin(oldVal, m.param2, ar);
				break;
			case MutationRef::ByteMax:
				m.param2 = doByteMax(oldVal, m.param2, ar);
				break;
			case MutationRef::MinV2:
				m.param2 = doMinV2(oldVal, m.param2, ar);
				break;
			case MutationRef::AndV2:
				m.param2 = doAndV2(oldVal, m.param2, ar);
				break;
			case MutationRef::CompareAndClear:
				if (oldVal.present() && m.param2 == oldVal.get()) {
					m.type = MutationRef::ClearRange;
					m.param2 = keyAfter(m.param1, ar);
					applyDelta(dataMap, ar, keyRange, m);
				};
				return;
			}
		}
		if (it == dataMap->end()) {
			dataMap->insert({ m.param1, m.param2 });
		} else {
			it->second = m.param2;
		}
	}
}

// TODO might want to change this to an actor so it can yield periodically?
static void applyDeltas(std::map<KeyRef, ValueRef>* dataMap,
                        Arena& arena,
                        GranuleDeltas deltas,
                        KeyRangeRef keyRange,
                        Version readVersion,
                        Version* lastFileEndVersion) {
	if (!deltas.empty()) {
		// check that consecutive delta file versions are disjoint
		ASSERT(*lastFileEndVersion < deltas.front().version);
	}
	for (MutationsAndVersionRef& delta : deltas) {
		if (delta.version > readVersion) {
			*lastFileEndVersion = readVersion;
			return;
		}
		for (auto& m : delta.mutations) {
			applyDelta(dataMap, arena, keyRange, m);
		}
	}
	if (!deltas.empty()) {
		*lastFileEndVersion = deltas.back().version;
	}
}

// TODO: improve the interface of this function so that it doesn't need
//       to be passed the entire BlobWorkerStats object
// Materializes the rows of one granule chunk restricted to keyRange.
//
// The snapshot file (if the chunk has one) and all delta files are requested from bstore up front; once the snapshot
// has been loaded into an ordered map, the delta files are applied in the order they are listed, followed by the
// chunk's in-memory deltas. The final map contents are deep-copied into the returned RangeResult.
//
//   chunk       - describes the snapshot file, delta files and in-memory deltas to combine
//   keyRange    - only keys in this half-open range are kept
//   readVersion - mutations newer than this version are not applied; currently asserted to equal
//                 chunk.includedVersion
//   bstore      - container the files are read from
//   stats       - if present, s3GetReqs is incremented once for every file read that is started
//
// Any Error is printed to stdout and rethrown.
ACTOR Future<RangeResult> readBlobGranule(BlobGranuleChunkRef chunk,
                                          KeyRangeRef keyRange,
                                          Version readVersion,
                                          Reference<BackupContainerFileSystem> bstore,
                                          Optional<BlobWorkerStats*> stats) {

	// TODO REMOVE with V2 of protocol
	ASSERT(readVersion == chunk.includedVersion);
	// Arena to hold all allocations for applying deltas. Most of it, and the arenas produced by reading the files,
	// will likely be tossed if there are a significant number of mutations, so we copy at the end instead of doing a
	// dependsOn.
	// FIXME: probably some threshold of a small percentage of the data is actually changed, where it makes sense to
	// just to dependsOn instead of copy, to use a little extra memory footprint to help cpu?
	state Arena arena;

	try {
		state std::map<KeyRef, ValueRef> dataMap;
		state Version lastFileEndVersion = invalidVersion;

		// Start the snapshot read; with no snapshot file an already-ready empty arena stands in for it.
		Future<Arena> readSnapshotFuture;
		if (chunk.snapshotFile.present()) {
			readSnapshotFuture = readSnapshotFile(bstore, chunk.snapshotFile.get(), keyRange, &dataMap);
			if (stats.present()) {
				++stats.get()->s3GetReqs;
			}
		} else {
			readSnapshotFuture = Future<Arena>(Arena());
		}

		// Start every delta file read before waiting on anything.
		state std::vector<Future<Standalone<GranuleDeltas>>> readDeltaFutures;
		readDeltaFutures.reserve(chunk.deltaFiles.size());
		for (BlobFilePointerRef deltaFile : chunk.deltaFiles) {
			readDeltaFutures.push_back(readDeltaFile(bstore, deltaFile, keyRange, readVersion));
			if (stats.present()) {
				++stats.get()->s3GetReqs;
			}
		}

		// dataMap entries reference memory owned by the snapshot arena, so keep it alive through `arena`.
		Arena snapshotArena = wait(readSnapshotFuture);
		arena.dependsOn(snapshotArena);

		if (BG_READ_DEBUG) {
			// std::vector::size() returns size_t, so convert explicitly to match the %d conversion.
			printf("Applying %d delta files\n", static_cast<int>(readDeltaFutures.size()));
		}
		for (Future<Standalone<GranuleDeltas>> deltaFuture : readDeltaFutures) {
			Standalone<GranuleDeltas> result = wait(deltaFuture);
			// Applied mutations may leave references into this file's memory inside dataMap.
			arena.dependsOn(result.arena());
			applyDeltas(&dataMap, arena, result, keyRange, readVersion, &lastFileEndVersion);
			wait(yield());
		}
		if (BG_READ_DEBUG) {
			printf("Applying %d memory deltas\n", chunk.newDeltas.size());
		}
		applyDeltas(&dataMap, arena, chunk.newDeltas, keyRange, readVersion, &lastFileEndVersion);
		wait(yield());

		// Deep-copy the surviving rows so the result does not depend on any of the file arenas.
		RangeResult ret;
		for (auto& it : dataMap) {
			ret.push_back_deep(ret.arena(), KeyValueRef(it.first, it.second));
			// TODO for large reads, probably wait to yield periodically here for SlowTask
		}

		return ret;
	} catch (Error& e) {
		printf("Reading blob granule got error %s\n", e.name());
		throw e;
	}
}

// TODO probably should add things like limit/bytelimit at some point?
// Reads the chunks listed in `reply` one after another and pushes each chunk's rows onto `results`. The stream is
// finished with end_of_stream() after the last chunk; if anything fails, the error is printed and forwarded on the
// stream instead. The returned future itself always completes with Void.
ACTOR Future<Void> readBlobGranules(BlobGranuleFileRequest request,
                                    BlobGranuleFileReply reply,
                                    Reference<BackupContainerFileSystem> bstore,
                                    PromiseStream<RangeResult> results) {
	// TODO for large amount of chunks, this should probably have some sort of buffer limit like ReplyPromiseStream.
	// Maybe just use ReplyPromiseStream instead of PromiseStream?
	try {
		state int chunkIdx = 0;
		while (chunkIdx < reply.chunks.size()) {
			// Chunks are read strictly in sequence: the next read starts only after this one has been sent.
			RangeResult rows =
			    wait(readBlobGranule(reply.chunks[chunkIdx], request.keyRange, request.readVersion, bstore));
			results.send(std::move(rows));
			chunkIdx++;
		}
		results.sendError(end_of_stream());
	} catch (Error& e) {
		printf("ReadBlobGranules got error %s\n", e.name());
		results.sendError(e);
	}

	return Void();
}

// Unit test for applyDelta: starts every case from the same three-row map {A, AB, B} -> "1" and checks the map
// contents after a single mutation (range clears, sets, out-of-range pruning, CompareAndClear and ByteMax).
TEST_CASE("/blobgranule/reader/applyDelta") {
	printf("Testing blob granule deltas\n");
	Arena arena;

	// Build each StringRef by copying a std::string into the arena rather than using LiteralStringRef: there is no
	// char* StringRef constructor, and valgrind might complain if the stringref data isn't in the arena
	std::string rawKeyA = "A";
	std::string rawKeyAB = "AB";
	std::string rawKeyB = "B";
	std::string rawKeyC = "C";
	std::string rawKeyZ = "Z";
	std::string rawValueOne = "1";
	std::string rawValueTwo = "2";

	StringRef keyA = StringRef(arena, rawKeyA);
	StringRef keyAB = StringRef(arena, rawKeyAB);
	StringRef keyB = StringRef(arena, rawKeyB);
	StringRef keyC = StringRef(arena, rawKeyC);
	StringRef keyZ = StringRef(arena, rawKeyZ);
	StringRef valueOne = StringRef(arena, rawValueOne);
	StringRef valueTwo = StringRef(arena, rawValueTwo);

	std::map<KeyRef, ValueRef> actual;
	actual.insert({ keyA, valueOne });
	actual.insert({ keyAB, valueOne });
	actual.insert({ keyB, valueOne });

	std::map<KeyRef, ValueRef> expected = actual;
	std::map<KeyRef, ValueRef> baseline = actual;

	ASSERT(actual == expected);

	// ---- range clears that remove every row ----

	// clear of the whole key space
	expected = baseline;
	expected.clear();
	actual = baseline;
	applyDelta(&actual, arena, allKeys, MutationRef(MutationRef::ClearRange, allKeys.begin, allKeys.end));
	ASSERT(actual == expected);

	// clear from the start of the key space up to C
	expected = baseline;
	expected.clear();
	actual = baseline;
	applyDelta(&actual, arena, allKeys, MutationRef(MutationRef::ClearRange, allKeys.begin, keyC));
	ASSERT(actual == expected);

	// clear from A to the end of the key space
	expected = baseline;
	expected.clear();
	actual = baseline;
	applyDelta(&actual, arena, allKeys, MutationRef(MutationRef::ClearRange, keyA, allKeys.end));
	ASSERT(actual == expected);

	// clear [A, C)
	expected = baseline;
	expected.clear();
	actual = baseline;
	applyDelta(&actual, arena, allKeys, MutationRef(MutationRef::ClearRange, keyA, keyC));
	ASSERT(actual == expected);

	// ---- range clears that remove a subset of the rows ----

	// clear [A, AB) removes only A (this mutation is reused by the pruning checks below)
	MutationRef clearOnlyA(MutationRef::ClearRange, keyA, keyAB);
	expected = baseline;
	expected.erase(keyA);
	actual = baseline;
	applyDelta(&actual, arena, allKeys, clearOnlyA);
	ASSERT(actual == expected);

	// clear [AB, B) removes only AB
	expected = baseline;
	expected.erase(keyAB);
	actual = baseline;
	applyDelta(&actual, arena, allKeys, MutationRef(MutationRef::ClearRange, keyAB, keyB));
	ASSERT(actual == expected);

	// clear [B, C) removes only B (this mutation is reused by the pruning checks below)
	MutationRef clearOnlyB(MutationRef::ClearRange, keyB, keyC);
	expected = baseline;
	expected.erase(keyB);
	actual = baseline;
	applyDelta(&actual, arena, allKeys, clearOnlyB);
	ASSERT(actual == expected);

	// clear [A, B) removes A and AB
	expected = baseline;
	expected.erase(keyA);
	expected.erase(keyAB);
	actual = baseline;
	applyDelta(&actual, arena, allKeys, MutationRef(MutationRef::ClearRange, keyA, keyB));
	ASSERT(actual == expected);

	// clear [AB, C) removes AB and B
	expected = baseline;
	expected.erase(keyAB);
	expected.erase(keyB);
	actual = baseline;
	applyDelta(&actual, arena, allKeys, MutationRef(MutationRef::ClearRange, keyAB, keyC));
	ASSERT(actual == expected);

	// ---- plain sets ----

	// overwrite A (this mutation is reused by the pruning checks below)
	MutationRef overwriteA(MutationRef::SetValue, keyA, valueTwo);
	expected = baseline;
	expected[keyA] = valueTwo;
	actual = baseline;
	applyDelta(&actual, arena, allKeys, overwriteA);
	ASSERT(actual == expected);

	// overwrite AB
	expected = baseline;
	expected[keyAB] = valueTwo;
	actual = baseline;
	applyDelta(&actual, arena, allKeys, MutationRef(MutationRef::SetValue, keyAB, valueTwo));
	ASSERT(actual == expected);

	// overwrite B
	expected = baseline;
	expected[keyB] = valueTwo;
	actual = baseline;
	applyDelta(&actual, arena, allKeys, MutationRef(MutationRef::SetValue, keyB, valueTwo));
	ASSERT(actual == expected);

	// set of a key that is not present yet inserts it
	expected = baseline;
	expected[keyC] = valueTwo;
	actual = baseline;
	applyDelta(&actual, arena, allKeys, MutationRef(MutationRef::SetValue, keyC, valueTwo));
	ASSERT(actual == expected);

	// ---- mutations outside the requested key range must be dropped ----
	// These four checks share one map without resetting it in between; each one is expected to leave it untouched.

	actual = baseline;
	applyDelta(&actual, arena, KeyRangeRef(keyA, keyC), MutationRef(MutationRef::SetValue, keyZ, valueTwo));
	ASSERT(actual == baseline);

	applyDelta(&actual, arena, KeyRangeRef(keyAB, keyC), overwriteA);
	ASSERT(actual == baseline);

	applyDelta(&actual, arena, KeyRangeRef(keyAB, keyC), clearOnlyA);
	ASSERT(actual == baseline);

	applyDelta(&actual, arena, KeyRangeRef(keyA, keyAB), clearOnlyB);
	ASSERT(actual == baseline);

	// ---- atomic ops ----
	// Could test all other atomic ops, but if set, max, and compare+clear works, and the others all just directly call
	// the atomics, there is little to test

	// compare-and-clear with the stored value removes the key
	expected = baseline;
	expected.erase(keyA);
	actual = baseline;
	applyDelta(&actual, arena, allKeys, MutationRef(MutationRef::CompareAndClear, keyA, valueOne));
	ASSERT(actual == expected);

	// compare-and-clear with a different value changes nothing
	actual = baseline;
	applyDelta(&actual, arena, allKeys, MutationRef(MutationRef::CompareAndClear, keyA, valueTwo));
	ASSERT(actual == baseline);

	// compare-and-clear on a key that is not present changes nothing
	actual = baseline;
	applyDelta(&actual, arena, allKeys, MutationRef(MutationRef::CompareAndClear, keyZ, valueTwo));
	ASSERT(actual == baseline);

	// byte-max against an existing smaller value stores the operand
	expected = baseline;
	expected[keyA] = valueTwo;
	actual = baseline;
	applyDelta(&actual, arena, allKeys, MutationRef(MutationRef::ByteMax, keyA, valueTwo));
	ASSERT(actual == expected);

	// byte-max on a key that is not present stores the operand
	expected = baseline;
	expected[keyC] = valueTwo;
	actual = baseline;
	applyDelta(&actual, arena, allKeys, MutationRef(MutationRef::ByteMax, keyC, valueTwo));
	ASSERT(actual == expected);

	return Void();
}