foundationdb/fdbclient/Tuple.cpp

/*
 * Tuple.cpp
 *
 * This source file is part of the FoundationDB open source project
 *
 * Copyright 2013-2024 Apple Inc. and the FoundationDB project authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "fdbclient/Tuple.h"

#include <cstring>

#include "flow/UnitTest.h"

// Type code that introduces a versionstamp element in packed tuple data.
const uint8_t VERSIONSTAMP_96_CODE = 0x33;
// Inclusive bounds of the type-code range that Tuple::isUserType() treats as user-defined types.
const uint8_t USER_TYPE_START = 0x40;
const uint8_t USER_TYPE_END = 0x4f;

// TODO: Many functions copied from bindings/flow/Tuple.cpp. Merge at some point.
// Returns a float whose object representation is that of `orig` after being passed through bigEndian32().
// The bits are moved with memcpy instead of being read through a casted pointer: accessing a float object
// through an integer lvalue violates the strict-aliasing rule and is undefined behavior.
static float bigEndianFloat(float orig) {
	static_assert(sizeof(float) == sizeof(uint32_t), "float is expected to be 32 bits wide");

	uint32_t bits;
	memcpy(&bits, &orig, sizeof(bits));
	bits = bigEndian32(bits);

	float swapped;
	memcpy(&swapped, &bits, sizeof(swapped));
	return swapped;
}

// Returns a double whose object representation is that of `orig` after being passed through bigEndian64().
// The bits are moved with memcpy instead of being read through a casted pointer: accessing a double object
// through an integer lvalue violates the strict-aliasing rule and is undefined behavior.
static double bigEndianDouble(double orig) {
	static_assert(sizeof(double) == sizeof(uint64_t), "double is expected to be 64 bits wide");

	uint64_t bits;
	memcpy(&bits, &orig, sizeof(bits));
	bits = bigEndian64(bits);

	double swapped;
	memcpy(&swapped, &bits, sizeof(swapped));
	return swapped;
}

// Scans `data` from `offset` for the end of an encoded string and returns the position where the scan stops.
// A 0x00 byte followed by 0xff is an escaped null and is skipped as a pair; a 0x00 followed by anything else
// is the terminator. The scan never inspects the final byte of `data`, so it also stops once it reaches (or
// steps past) the last position.
static size_t findStringTerminator(const StringRef data, size_t offset) {
	size_t pos = offset;
	for (;;) {
		if (!(pos < data.size() - 1)) {
			break;
		}
		const bool atNull = data[pos] == '\x00';
		if (atNull && data[pos + 1] != (uint8_t)'\xff') {
			// Unescaped null: this is the terminator.
			break;
		}
		// Step over an escaped null (two bytes) or an ordinary byte (one byte).
		pos += atNull ? 2 : 1;
	}

	return pos;
}

// Transforms the `size` bytes at `bytes` in place between the raw and the encoded floating point forms.
// A value counts as negative when its top bit is 1 while encoding, or 0 while decoding.
// Negative values have every bit flipped; all other values only have the top bit of the first byte flipped.
static void adjustFloatingPoint(uint8_t* bytes, size_t size, bool encode) {
	const bool topBitSet = (bytes[0] & 0x80) != 0;
	const bool negative = encode ? topBitSet : !topBitSet;

	if (!negative) {
		bytes[0] ^= static_cast<uint8_t>(0x80);
		return;
	}

	for (uint8_t* p = bytes; p != bytes + size; ++p) {
		*p ^= static_cast<uint8_t>(0xff);
	}
}

// Builds a tuple from packed bytes: copies `str` into `data` and records in `offsets` the position at which
// each element starts.
//   exclude_incomplete - when true, the last element is dropped if its encoded length runs past the end of
//                        the data.
//   include_user_type  - when true, a user-type code is accepted and consumes the remainder of the data.
// Throws invalid_tuple_data_type when an element starts with a type code that is not recognized (this
// includes user-type codes when include_user_type is false).
Tuple::Tuple(StringRef const& str, bool exclude_incomplete, bool include_user_type) {
	data.append(data.arena(), str.begin(), str.size());

	size_t pos = 0;
	while (pos < data.size()) {
		offsets.push_back(pos);
		const uint8_t code = data[pos];

		if (code == '\x00' || code == 0x26 || code == 0x27) {
			// Null and boolean elements consist of the type code alone.
			pos += 1;
		} else if (code == '\x01' || code == '\x02') {
			// Byte strings and UTF-8 strings: continue just past the position the terminator scan returns.
			pos = findStringTerminator(str, pos + 1) + 1;
		} else if (code >= '\x0c' && code <= '\x1c') {
			// Integers: the distance of the code from 0x14 is the number of payload bytes.
			pos += abs(code - '\x14') + 1;
		} else if (code == 0x20) {
			pos += sizeof(float) + 1;
		} else if (code == 0x21) {
			pos += sizeof(double) + 1;
		} else if (code == VERSIONSTAMP_96_CODE) {
			pos += VERSIONSTAMP_TUPLE_SIZE + 1;
		} else if (include_user_type && isUserType(code)) {
			// User defined codes must come at the end of a Tuple and are not delimited.
			pos = data.size();
		} else {
			throw invalid_tuple_data_type();
		}
	}

	// If incomplete elements are to be excluded, drop the last offset when its element extends beyond size().
	// Strings will never be considered incomplete due to the way the string end is found.
	if (exclude_incomplete && pos > data.size())
		offsets.pop_back();
}

// Parses packed tuple bytes into a Tuple, forwarding `exclude_incomplete` to the parsing constructor and
// leaving the constructor's remaining argument at its default.
Tuple Tuple::unpack(StringRef const& str, bool exclude_incomplete) {
	Tuple parsed(str, exclude_incomplete);
	return parsed;
}

std::string Tuple::tupleToString(const Tuple& tuple) {
	std::string str;
	if (tuple.size() > 1) {
		str += "(";
	}
	for (int i = 0; i < tuple.size(); ++i) {
		Tuple::ElementType type = tuple.getType(i);
		if (type == Tuple::NULL_TYPE) {
			str += "NULL";
		} else if (type == Tuple::BYTES || type == Tuple::UTF8) {
			if (type == Tuple::UTF8) {
				str += "u";
			}
			str += "\'" + tuple.getString(i).printable() + "\'";
		} else if (type == Tuple::INT) {
			str += format("%ld", tuple.getInt(i));
		} else if (type == Tuple::FLOAT) {
			str += format("%f", tuple.getFloat(i));
		} else if (type == Tuple::DOUBLE) {
			str += format("%f", tuple.getDouble(i));
		} else if (type == Tuple::BOOL) {
			str += tuple.getBool(i) ? "true" : "false";
		} else if (type == Tuple::VERSIONSTAMP) {
			TupleVersionstamp versionstamp = tuple.getVersionstamp(i);
			str += format("Transaction Version: '%ld', BatchNumber: '%hd', UserVersion : '%hd'",
			              versionstamp.getVersion(),
			              versionstamp.getBatchNumber(),
			              versionstamp.getUserVersion());
		} else {
			ASSERT(false);
		}

		if (i < tuple.size() - 1) {
			str += ", ";
		}
	}
	if (tuple.size() > 1) {
		str += ")";
	}
	return str;
}

// Parses packed tuple bytes like the parsing constructor does, with user-type codes accepted.
Tuple Tuple::unpackUserType(StringRef const& str, bool exclude_incomplete) {
	const bool include_user_type = true;
	Tuple parsed(str, exclude_incomplete, include_user_type);
	return parsed;
}

// Returns true when `code` lies in the inclusive range [USER_TYPE_START, USER_TYPE_END].
bool Tuple::isUserType(uint8_t code) const {
	if (code < USER_TYPE_START) {
		return false;
	}
	return code <= USER_TYPE_END;
}

// Appends every element of `tuple` to this tuple and returns *this.
// The source offsets are shifted by the current size of `data`, which is read once before anything is
// modified. The offsets are walked by index rather than by iterator so that appending a tuple to itself
// does not iterate over a vector that is being grown at the same time.
Tuple& Tuple::append(Tuple const& tuple) {
	const size_t base = data.size();
	const size_t count = tuple.offsets.size();

	offsets.reserve(offsets.size() + count);
	for (size_t k = 0; k < count; ++k) {
		offsets.push_back(tuple.offsets[k] + base);
	}

	data.append(data.arena(), tuple.data.begin(), tuple.data.size());

	return *this;
}

// Appends a versionstamp element: the VERSIONSTAMP_96_CODE type byte followed by the bytes of `vs`.
// Returns *this so calls can be chained.
Tuple& Tuple::append(TupleVersionstamp const& vs) {
	const auto elementStart = data.size();
	offsets.push_back(elementStart);

	data.append(data.arena(), &VERSIONSTAMP_96_CODE, 1);
	data.append(data.arena(), vs.begin(), vs.size());

	return *this;
}

// Appends a string element and returns *this.
// The element is the type code (0x02 when `utf8` is set, otherwise 0x01), then the bytes of `str` with every
// 0x00 byte written as the pair 0x00 0xff, then a terminating 0x00.
Tuple& Tuple::append(StringRef const& str, bool utf8) {
	offsets.push_back(data.size());

	uint8_t typeCode = '\x01';
	if (utf8) {
		typeCode = '\x02';
	}
	data.push_back(data.arena(), typeCode);

	const uint8_t* runStart = str.begin();
	const uint8_t* const strEnd = str.begin() + str.size();
	for (const uint8_t* p = runStart; p != strEnd; ++p) {
		if (*p != '\x00') {
			continue;
		}
		// Flush the bytes seen since the previous null, then emit the escaped null.
		data.append(data.arena(), runStart, p - runStart);
		data.push_back(data.arena(), (uint8_t)'\x00');
		data.push_back(data.arena(), (uint8_t)'\xff');
		runStart = p + 1;
	}

	// Remaining bytes after the last null, followed by the terminator.
	data.append(data.arena(), runStart, strEnd - runStart);
	data.push_back(data.arena(), (uint8_t)'\x00');

	return *this;
}

// Appends the string wrapped by `str` as a UTF-8 string element and returns *this.
Tuple& Tuple::append(UnicodeStr const& str) {
	const bool utf8 = true;
	return append(str.str, utf8);
}

// Appends the bytes of `str` unchanged and records them as one element starting at the current end of
// `data`. No type code is added and nothing is escaped. Returns *this.
Tuple& Tuple::appendRaw(StringRef const& str) {
	const auto elementStart = data.size();
	offsets.push_back(elementStart);
	data.append(data.arena(), str.begin(), str.size());

	return *this;
}

// Appends an integer element and returns *this.
// The value is written big-endian using the fewest bytes that hold it. The type code is 0x14 plus the byte
// count for non-negative values and 0x14 minus the byte count for negative ones; zero is the bare code 0x14.
// Negative values are stored as (value - 1), i.e. the one's complement of their magnitude.
Tuple& Tuple::append(int64_t value) {
	offsets.push_back(data.size());

	const bool neg = value < 0;

	// The subtraction is done on the unsigned bit pattern: the signed form ~(-value) overflows, which is
	// undefined behavior, when value is INT64_MIN. For every other negative value the result is identical.
	uint64_t bits = static_cast<uint64_t>(value);
	if (neg) {
		bits -= 1;
	}

	const uint64_t swap = bigEndian64(bits);
	const uint8_t* const bytes = reinterpret_cast<const uint8_t*>(&swap);
	// Leading bytes equal to the padding value carry no information and are not written.
	const uint8_t padding = neg ? 0xff : 0x00;

	for (int i = 0; i < 8; i++) {
		if (bytes[i] != padding) {
			const int len = 8 - i;
			data.push_back(data.arena(), (uint8_t)(20 + (neg ? -len : len)));
			data.append(data.arena(), bytes + i, len);
			return *this;
		}
	}

	// Every byte was padding: the value is zero.
	data.push_back(data.arena(), (uint8_t)'\x14');
	return *this;
}

// Appends a 32-bit integer by widening it and delegating to the int64_t overload. Returns *this.
Tuple& Tuple::append(int32_t value) {
	const int64_t widened = value;
	return append(widened);
}

// Appends a boolean element, which is a single type code byte: 0x27 for true, 0x26 for false.
// Returns *this.
Tuple& Tuple::append(bool value) {
	offsets.push_back(data.size());
	const uint8_t code = value ? 0x27 : 0x26;
	data.push_back(data.arena(), code);
	return *this;
}

// Appends a float element: type code 0x20 followed by sizeof(float) bytes. The bytes are the value passed
// through bigEndianFloat() and then adjustFloatingPoint() in encode mode. Returns *this.
Tuple& Tuple::append(float value) {
	offsets.push_back(data.size());
	data.push_back(data.arena(), 0x20);

	float encoded = bigEndianFloat(value);
	uint8_t* const raw = (uint8_t*)&encoded;
	adjustFloatingPoint(raw, sizeof(float), true);
	data.append(data.arena(), raw, sizeof(float));

	return *this;
}

// Appends a double element: type code 0x21 followed by sizeof(double) bytes. The bytes are the value passed
// through bigEndianDouble() and then adjustFloatingPoint() in encode mode. Returns *this.
Tuple& Tuple::append(double value) {
	offsets.push_back(data.size());
	data.push_back(data.arena(), 0x21);

	double encoded = bigEndianDouble(value);
	uint8_t* const raw = (uint8_t*)&encoded;
	adjustFloatingPoint(raw, sizeof(double), true);
	data.append(data.arena(), raw, sizeof(double));

	return *this;
}

// Appends a null element, which is the single byte 0x00. Returns *this.
Tuple& Tuple::append(std::nullptr_t) {
	const auto elementStart = data.size();
	offsets.push_back(elementStart);

	const uint8_t nullCode = (uint8_t)'\x00';
	data.push_back(data.arena(), nullCode);
	return *this;
}

// Appends a null element by delegating to the std::nullptr_t overload of append(). Returns *this.
Tuple& Tuple::appendNull() {
	const std::nullptr_t none = nullptr;
	return append(none);
}

// Appends a user-type element: the code byte of `udt` followed by its payload bytes, with no escaping and
// no terminator. Asserts that the code lies in the user-type range. Returns *this.
Tuple& Tuple::append(Tuple::UserTypeStr const& udt) {
	const auto elementStart = data.size();
	offsets.push_back(elementStart);
	ASSERT(isUserType(udt.code));

	const auto& payload = udt.str;
	data.push_back(data.arena(), udt.code);
	data.append(data.arena(), payload.begin(), payload.size());

	return *this;
}

// Returns the element type at `index`, derived from the type code stored at that element's offset.
// Throws invalid_tuple_index when `index` is out of range and invalid_tuple_data_type when the code is not
// one of the recognized codes.
Tuple::ElementType Tuple::getType(size_t index) const {
	if (index >= offsets.size()) {
		throw invalid_tuple_index();
	}

	const uint8_t code = data[offsets[index]];

	// Codes that identify a type on their own.
	switch (code) {
	case 0x00:
		return ElementType::NULL_TYPE;
	case 0x01:
		return ElementType::BYTES;
	case 0x02:
		return ElementType::UTF8;
	case 0x20:
		return ElementType::FLOAT;
	case 0x21:
		return ElementType::DOUBLE;
	case 0x26:
	case 0x27:
		return ElementType::BOOL;
	case VERSIONSTAMP_96_CODE:
		return ElementType::VERSIONSTAMP;
	default:
		break;
	}

	// Codes that are identified by the range they fall in.
	if (code >= 0x0c && code <= 0x1c) {
		return ElementType::INT;
	}
	if (isUserType(code)) {
		return ElementType::USER_TYPE;
	}

	throw invalid_tuple_data_type();
}

// Returns the decoded contents of the byte-string or UTF-8 string element at `index`.
// Decoding walks the element's bytes (up to the next element's offset, or the end of `data` for the last
// element): a 0x00 that still has a byte after it inside the element produces a single 0x00 and that
// following byte is skipped; a 0x00 that is the final byte of the element produces nothing.
// Throws invalid_tuple_index when `index` is out of range and invalid_tuple_data_type when the element is
// not a string.
Standalone<StringRef> Tuple::getString(size_t index) const {
	if (index >= offsets.size()) {
		throw invalid_tuple_index();
	}

	const uint8_t code = data[offsets[index]];
	const bool isString = (code == '\x01' || code == '\x02');
	if (!isString) {
		throw invalid_tuple_data_type();
	}

	size_t runStart = offsets[index] + 1;
	size_t limit;
	if (index + 1 < offsets.size()) {
		limit = offsets[index + 1];
	} else {
		limit = data.size();
	}

	Standalone<StringRef> result;
	VectorRef<uint8_t> staging;

	size_t cur = runStart;
	while (cur < limit) {
		if (data[cur] != '\x00') {
			++cur;
			continue;
		}

		// Copy the plain bytes collected so far, then step onto the byte that follows the null.
		staging.append(result.arena(), data.begin() + runStart, cur - runStart);
		++cur;
		runStart = cur + 1;

		if (cur < limit) {
			staging.push_back(result.arena(), '\x00');
		}
		++cur;
	}

	if (runStart < limit) {
		staging.append(result.arena(), data.begin() + runStart, limit - runStart);
	}

	result.StringRef::operator=(StringRef(staging.begin(), staging.size()));
	return result;
}

// Decodes the integer element at `index`.
//   allow_incomplete - when false, asserts that all payload bytes announced by the type code are present.
//                      When true, a truncated payload is completed as described in the body below.
// Throws invalid_tuple_index when `index` is out of range and invalid_tuple_data_type when the element is
// not an integer.
int64_t Tuple::getInt(size_t index, bool allow_incomplete) const {
	if (index >= offsets.size()) {
		throw invalid_tuple_index();
	}

	const size_t start = offsets[index];
	ASSERT(start < data.size());
	const uint8_t code = data[start];
	if (code < '\x0c' || code > '\x1c') {
		throw invalid_tuple_data_type();
	}

	// The distance of the code from 0x14 is the payload length; codes below 0x14 mark negative values.
	const int signedLen = code - '\x14';
	const bool neg = signedLen < 0;
	const int len = neg ? -signedLen : signedLen;

	int64_t swap;
	memset(&swap, neg ? '\xff' : 0, 8 - len);

	// presentLen is how many of len bytes are actually present, it will be < len if the encoded tuple was
	// truncated. The comparison is done in size_t: narrowing the remaining byte count to 8 bits would turn
	// any tail of 128 bytes or more into a negative or wrapped value.
	const size_t available = data.size() - start - 1;
	const int presentLen = static_cast<int>(std::min<size_t>(static_cast<size_t>(len), available));
	ASSERT(len == presentLen || allow_incomplete);
	memcpy(((uint8_t*)&swap) + 8 - len, data.begin() + start + 1, presentLen);
	if (presentLen < len) {
		int suffix = len - presentLen;
		if (presentLen == 0) {
			// The first byte in an int would always be at least 1, because if was 0 then a shorter int type would have
			// been used. So if we don't have the first (most significant) byte in the encoded string, use 1 so that the
			// decoded result maintains the encoded form's sort order with an encoded value of a shorter and same-signed
			// type.
			*(((uint8_t*)&swap) + 8 - len) = 1;
			--suffix; // The suffix to clear below is now 1 byte shorter.
		}
		memset(((uint8_t*)&swap) + 8 - suffix, 0, suffix);
	}

	swap = bigEndian64(swap);

	if (neg) {
		// Equivalent to -(~swap), i.e. swap + 1, but computed on the unsigned bit pattern so that the
		// encoding of INT64_MIN does not trigger signed overflow.
		swap = static_cast<int64_t>(static_cast<uint64_t>(swap) + 1);
	}

	return swap;
}

// TODO: Combine with bindings/flow/Tuple.*. This code is copied from there.
// Returns the boolean stored at `index`: false for type code 0x26, true for 0x27.
// Throws invalid_tuple_index when `index` is out of range and invalid_tuple_data_type for any other code.
bool Tuple::getBool(size_t index) const {
	if (index >= offsets.size()) {
		throw invalid_tuple_index();
	}
	ASSERT_LT(offsets[index], data.size());

	switch (data[offsets[index]]) {
	case 0x26:
		return false;
	case 0x27:
		return true;
	default:
		throw invalid_tuple_data_type();
	}
}

// Decodes the float element at `index` by undoing the steps of append(float): adjustFloatingPoint() in
// decode mode followed by bigEndianFloat().
// Throws invalid_tuple_index when `index` is out of range and invalid_tuple_data_type when the element is
// not a float; asserts that the full payload is present.
float Tuple::getFloat(size_t index) const {
	if (index >= offsets.size()) {
		throw invalid_tuple_index();
	}
	ASSERT_LT(offsets[index], data.size());
	uint8_t code = data[offsets[index]];
	if (code != 0x20) {
		throw invalid_tuple_data_type();
	}

	const size_t payload = offsets[index] + 1;
	ASSERT_LE(payload + sizeof(float), data.size());

	// The payload sits at an arbitrary byte offset, so it is copied out with memcpy; dereferencing it as
	// a float* would be a potentially misaligned access through an incompatible pointer type.
	float swap;
	memcpy(&swap, data.begin() + payload, sizeof(float));
	adjustFloatingPoint((uint8_t*)&swap, sizeof(float), false);

	return bigEndianFloat(swap);
}

// Decodes the double element at `index` by undoing the steps of append(double): adjustFloatingPoint() in
// decode mode followed by bigEndianDouble().
// Throws invalid_tuple_index when `index` is out of range and invalid_tuple_data_type when the element is
// not a double; asserts that the full payload is present.
double Tuple::getDouble(size_t index) const {
	if (index >= offsets.size()) {
		throw invalid_tuple_index();
	}
	ASSERT_LT(offsets[index], data.size());
	uint8_t code = data[offsets[index]];
	if (code != 0x21) {
		throw invalid_tuple_data_type();
	}

	const size_t payload = offsets[index] + 1;
	ASSERT_LE(payload + sizeof(double), data.size());

	// The payload sits at an arbitrary byte offset, so it is copied out with memcpy; dereferencing it as
	// a double* would be a potentially misaligned access through an incompatible pointer type.
	double swap;
	memcpy(&swap, data.begin() + payload, sizeof(double));
	adjustFloatingPoint((uint8_t*)&swap, sizeof(double), false);

	return bigEndianDouble(swap);
}

// Returns the versionstamp element at `index`, built from the VERSIONSTAMP_TUPLE_SIZE bytes that follow its
// type code.
// Throws invalid_tuple_index when `index` is out of range and invalid_tuple_data_type when the element is
// not a versionstamp; asserts that the full payload is present.
TupleVersionstamp Tuple::getVersionstamp(size_t index) const {
	if (index >= offsets.size()) {
		throw invalid_tuple_index();
	}
	ASSERT_LT(offsets[index], data.size());
	uint8_t code = data[offsets[index]];
	if (code != VERSIONSTAMP_96_CODE) {
		throw invalid_tuple_data_type();
	}

	// Same guard as getFloat()/getDouble(): a tuple parsed without excluding incomplete elements may end
	// before the payload does, and the StringRef below must not extend past the end of `data`.
	const size_t payload = offsets[index] + 1;
	ASSERT_LE(payload + VERSIONSTAMP_TUPLE_SIZE, data.size());

	return TupleVersionstamp(StringRef(data.begin() + payload, VERSIONSTAMP_TUPLE_SIZE));
}

// Returns the user-type element at `index` as its code plus a copy of every byte from just after the code
// to the end of `data`.
// Throws invalid_tuple_index when `index` is out of range and invalid_tuple_data_type when the code at that
// element is not a user-type code.
Tuple::UserTypeStr Tuple::getUserType(size_t index) const {
	if (index >= offsets.size()) {
		throw invalid_tuple_index();
	}

	const size_t codePos = offsets[index];
	ASSERT_LT(codePos, data.size());

	const uint8_t code = data[codePos];
	if (!isUserType(code)) {
		throw invalid_tuple_data_type();
	}

	// The payload is everything after the code byte.
	const size_t payloadPos = codePos + 1;

	Standalone<StringRef> payload;
	VectorRef<uint8_t> staging;
	staging.append(payload.arena(), data.begin() + payloadPos, data.size() - payloadPos);
	payload.StringRef::operator=(StringRef(staging.begin(), staging.size()));

	return Tuple::UserTypeStr(code, payload);
}

// Returns the key range whose begin key is this tuple's bytes followed by the packed bytes of `tuple` and
// 0x00, and whose end key is the same prefix followed by 0xff.
KeyRange Tuple::range(Tuple const& tuple) const {
	KeyRange keyRange;
	const auto suffix = tuple.pack();

	// Builds prefix + suffix + lastByte into `key`, allocating from the result's arena.
	auto buildKey = [&](VectorRef<uint8_t>& key, uint8_t lastByte) {
		key.reserve(keyRange.arena(), data.size() + suffix.size() + 1);
		key.append(keyRange.arena(), data.begin(), data.size());
		key.append(keyRange.arena(), suffix.begin(), suffix.size());
		key.push_back(keyRange.arena(), lastByte);
	};

	VectorRef<uint8_t> begin;
	buildKey(begin, uint8_t('\x00'));

	VectorRef<uint8_t> end;
	buildKey(end, uint8_t('\xff'));

	const StringRef beginKey(begin.begin(), begin.size());
	const StringRef endKey(end.begin(), end.size());
	keyRange.KeyRangeRef::operator=(KeyRangeRef(beginKey, endKey));
	return keyRange;
}

// Returns a new tuple parsed from the bytes of elements [start, end). An `end` at or beyond the element
// count means "through the last element". An empty tuple is returned when `start` is out of range or the
// requested span is empty.
// NOTE(review): the slice is re-parsed with the constructor's default flags, so whether a slice containing
// a user-type element is accepted depends on those defaults - confirm against the header.
Tuple Tuple::subTuple(size_t start, size_t end) const {
	const bool emptySpan = end <= start;
	if (emptySpan || start >= offsets.size()) {
		return Tuple();
	}

	const size_t beginPos = offsets[start];
	size_t endPos;
	if (end < offsets.size()) {
		endPos = offsets[end];
	} else {
		endPos = data.size();
	}

	const StringRef slice(data.begin() + beginPos, endPos - beginPos);
	return Tuple(slice);
}

// Returns a view of the packed bytes of the single element at `index`, from its offset up to the next
// element's offset (or the end of `data` for the last element). Returns an empty StringRef when `index` is
// out of range. The view points into this tuple's `data`; nothing is copied.
StringRef Tuple::subTupleRawString(size_t index) const {
	const size_t count = offsets.size();
	if (index >= count) {
		return StringRef();
	}

	const size_t beginPos = offsets[index];
	const size_t next = index + 1;
	size_t endPos;
	if (next < count) {
		endPos = offsets[next];
	} else {
		endPos = data.size();
	}

	return StringRef(data.begin() + beginPos, endPos - beginPos);
}

// Checks that Tuple::makeTuple() and a chain of append() calls given the same nine values produce the same
// packed bytes, and that getType() reports the expected type for each position.
TEST_CASE("/fdbclient/Tuple/makeTuple") {
	// One element of every supported kind; the user-type element comes last.
	Tuple t1 = Tuple::makeTuple(1,
	                            1.0f,
	                            1.0,
	                            false,
	                            "byteStr"_sr,
	                            Tuple::UnicodeStr("str"_sr),
	                            nullptr,
	                            TupleVersionstamp("000000000000"_sr),
	                            Tuple::UserTypeStr(0x41, "12345678"_sr));
	// The same values, appended one at a time.
	Tuple t2 = Tuple()
	               .append(1)
	               .append(1.0f)
	               .append(1.0)
	               .append(false)
	               .append("byteStr"_sr)
	               .append(Tuple::UnicodeStr("str"_sr))
	               .append(nullptr)
	               .append(TupleVersionstamp("000000000000"_sr))
	               .append(Tuple::UserTypeStr(0x41, "12345678"_sr));

	// Both construction paths must yield identical packed bytes.
	ASSERT(t1.pack() == t2.pack());
	ASSERT(t1.getType(0) == Tuple::INT);
	ASSERT(t1.getType(1) == Tuple::FLOAT);
	ASSERT(t1.getType(2) == Tuple::DOUBLE);
	ASSERT(t1.getType(3) == Tuple::BOOL);
	ASSERT(t1.getType(4) == Tuple::BYTES);
	ASSERT(t1.getType(5) == Tuple::UTF8);
	ASSERT(t1.getType(6) == Tuple::NULL_TYPE);
	ASSERT(t1.getType(7) == Tuple::VERSIONSTAMP);
	ASSERT(t1.getType(8) == Tuple::USER_TYPE);
	ASSERT(t1.size() == 9);

	return Void();
}

// Checks that a tuple survives a pack() / unpackUserType() round trip element by element, and that plain
// unpack() rejects the same bytes because they contain a user-type element.
TEST_CASE("/fdbclient/Tuple/unpack") {
	// One element of every supported kind; the user-type element comes last.
	Tuple t1 = Tuple::makeTuple(1,
	                            1.0f,
	                            1.0,
	                            false,
	                            "byteStr"_sr,
	                            Tuple::UnicodeStr("str"_sr),
	                            nullptr,
	                            TupleVersionstamp("000000000000"_sr),
	                            Tuple::UserTypeStr(0x41, "12345678"_sr));

	Standalone<StringRef> packed = t1.pack();
	// Round trip through the user-type-aware parser and compare every element with the original.
	Tuple t2 = Tuple::unpackUserType(packed);
	ASSERT(t2.pack() == t1.pack());
	ASSERT(t2.getInt(0) == t1.getInt(0));
	ASSERT(t2.getFloat(1) == t1.getFloat(1));
	ASSERT(t2.getDouble(2) == t1.getDouble(2));
	ASSERT(t2.getBool(3) == t1.getBool(3));
	ASSERT(t2.getString(4) == t1.getString(4));
	ASSERT(t2.getString(5) == t1.getString(5));
	ASSERT(t2.getType(6) == Tuple::NULL_TYPE);
	ASSERT(t2.getVersionstamp(7) == t1.getVersionstamp(7));
	ASSERT(t2.getUserType(8) == t1.getUserType(8));
	ASSERT(t2.size() == 9);

	// Plain unpack() is expected to throw invalid_tuple_data_type on these bytes; reaching ASSERT(false)
	// means it did not throw, and any other error is propagated.
	try {
		Tuple t3 = Tuple::unpack(packed);
		ASSERT(false);
	} catch (Error& e) {
		if (e.code() != error_code_invalid_tuple_data_type) {
			throw e;
		}
	}

	return Void();
}