2019-11-26 13:00:13 +08:00
|
|
|
/*
|
|
|
|
* FileDecoder.actor.cpp
|
|
|
|
*
|
|
|
|
* This source file is part of the FoundationDB open source project
|
|
|
|
*
|
|
|
|
* Copyright 2013-2019 Apple Inc. and the FoundationDB project authors
|
|
|
|
*
|
|
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
* you may not use this file except in compliance with the License.
|
|
|
|
* You may obtain a copy of the License at
|
|
|
|
*
|
|
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
*
|
|
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
* See the License for the specific language governing permissions and
|
|
|
|
* limitations under the License.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <algorithm>
|
|
|
|
#include <iostream>
|
|
|
|
#include <vector>
|
|
|
|
|
|
|
|
#include "fdbclient/BackupAgent.actor.h"
|
|
|
|
#include "fdbclient/BackupContainer.h"
|
|
|
|
#include "fdbbackup/FileConverter.h"
|
|
|
|
#include "fdbclient/MutationList.h"
|
|
|
|
#include "flow/flow.h"
|
|
|
|
#include "flow/serialize.h"
|
2020-09-11 08:06:16 +08:00
|
|
|
#include "fdbclient/BuildFlags.h"
|
2020-02-04 02:42:05 +08:00
|
|
|
#include "flow/actorcompiler.h" // has to be last include
|
2019-11-26 13:00:13 +08:00
|
|
|
|
2020-03-25 01:54:12 +08:00
|
|
|
#define SevDecodeInfo SevVerbose
|
|
|
|
|
Decode out of order mutations in old mutation logs
In the old mutation logs, a version's mutations are serialized as a buffer.
Then the buffer is split into smaller chunks, e.g., 10000 bytes each. When
writting chunks to the final mutation log file, these chunks can be flushed
out of order. For instance, the (version, chunck_part) can be in the order of
(3, 0), (4, 0), (3, 1). As a result, the decoder must read forward to find all
chunks of data for a version.
Another complication is that the files are organized into blocks, where (3, 1)
can be in a subsequent block. This change checks the value size for each
version, if the size is smaller than the right size, the decoder will look
for the missing chucks in the next block.
2020-03-11 06:45:57 +08:00
|
|
|
extern bool g_crashOnError;
|
|
|
|
|
2019-11-26 13:00:13 +08:00
|
|
|
namespace file_converter {
|
|
|
|
|
|
|
|
void printDecodeUsage() {
|
|
|
|
std::cout << "\n"
|
|
|
|
" -r, --container Container URL.\n"
|
|
|
|
" -i, --input FILE Log file to be decoded.\n"
|
Decode out of order mutations in old mutation logs
In the old mutation logs, a version's mutations are serialized as a buffer.
Then the buffer is split into smaller chunks, e.g., 10000 bytes each. When
writting chunks to the final mutation log file, these chunks can be flushed
out of order. For instance, the (version, chunck_part) can be in the order of
(3, 0), (4, 0), (3, 1). As a result, the decoder must read forward to find all
chunks of data for a version.
Another complication is that the files are organized into blocks, where (3, 1)
can be in a subsequent block. This change checks the value size for each
version, if the size is smaller than the right size, the decoder will look
for the missing chucks in the next block.
2020-03-11 06:45:57 +08:00
|
|
|
" --crash Crash on serious error.\n"
|
2020-09-11 08:06:16 +08:00
|
|
|
" --build_flags Print build information and exit.\n"
|
2019-11-26 13:00:13 +08:00
|
|
|
"\n";
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2020-09-11 08:06:16 +08:00
|
|
|
// Writes the build information (as a JSON string) to stdout.
void printBuildInformation() {
	std::cout << jsonBuildInformation();
}
|
|
|
|
|
2019-11-26 13:00:13 +08:00
|
|
|
// Parsed command-line options for the decode tool.
struct DecodeParams {
	std::string container_url; // Backup container URL.
	std::string file; // Substring used to select which log files to decode.
	bool log_enabled = false; // Whether trace logging was requested.
	std::string log_dir, trace_format, trace_log_group;

	// Returns a one-line, human-readable summary of these parameters.
	std::string toString() {
		std::string s = "ContainerURL: ";
		s += container_url;
		s += ", File: ";
		s += file;
		if (log_enabled) {
			if (!log_dir.empty()) {
				s += " LogDir:" + log_dir;
			}
			if (!trace_format.empty()) {
				s += " Format:" + trace_format;
			}
			if (!trace_log_group.empty()) {
				s += " LogGroup:" + trace_log_group;
			}
		}
		return s;
	}
};
|
|
|
|
|
|
|
|
// Parses decode-tool command line arguments into "param".
// Returns FDB_EXIT_SUCCESS when all options parse cleanly; FDB_EXIT_ERROR on
// a bad option/argument, or after handling --help / --build_flags (both of
// which print and then stop further processing).
int parseDecodeCommandLine(DecodeParams* param, CSimpleOpt* args) {
	while (args->Next()) {
		// Reject any option whose argument failed to parse.
		if (args->LastError() != SO_SUCCESS) {
			std::cerr << "ERROR: argument given for option: " << args->OptionText() << "\n";
			return FDB_EXIT_ERROR;
		}

		switch (args->OptionId()) {
		case OPT_HELP:
			printDecodeUsage();
			return FDB_EXIT_ERROR;

		case OPT_CONTAINER:
			param->container_url = args->OptionArg();
			break;

		case OPT_CRASHONERROR:
			g_crashOnError = true;
			break;

		case OPT_INPUT_FILE:
			param->file = args->OptionArg();
			break;

		case OPT_TRACE:
			param->log_enabled = true;
			break;

		case OPT_TRACE_DIR:
			param->log_dir = args->OptionArg();
			break;

		case OPT_TRACE_FORMAT:
			if (!validateTraceFormat(args->OptionArg())) {
				std::cerr << "ERROR: Unrecognized trace format " << args->OptionArg() << "\n";
				return FDB_EXIT_ERROR;
			}
			param->trace_format = args->OptionArg();
			break;

		case OPT_TRACE_LOG_GROUP:
			param->trace_log_group = args->OptionArg();
			break;

		case OPT_BUILD_FLAGS:
			printBuildInformation();
			return FDB_EXIT_ERROR;
		}
	}
	return FDB_EXIT_SUCCESS;
}
|
|
|
|
|
|
|
|
void printLogFiles(std::string msg, const std::vector<LogFile>& files) {
|
|
|
|
std::cout << msg << " " << files.size() << " log files\n";
|
|
|
|
for (const auto& file : files) {
|
|
|
|
std::cout << file.toString() << "\n";
|
|
|
|
}
|
|
|
|
std::cout << std::endl;
|
|
|
|
}
|
|
|
|
|
|
|
|
std::vector<LogFile> getRelevantLogFiles(const std::vector<LogFile>& files, const DecodeParams& params) {
|
|
|
|
std::vector<LogFile> filtered;
|
|
|
|
for (const auto& file : files) {
|
|
|
|
if (file.fileName.find(params.file) != std::string::npos) {
|
|
|
|
filtered.push_back(file);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return filtered;
|
|
|
|
}
|
|
|
|
|
|
|
|
std::pair<Version, int32_t> decode_key(const StringRef& key) {
|
|
|
|
ASSERT(key.size() == sizeof(uint8_t) + sizeof(Version) + sizeof(int32_t));
|
|
|
|
|
|
|
|
uint8_t hash;
|
|
|
|
Version version;
|
|
|
|
int32_t part;
|
|
|
|
BinaryReader rd(key, Unversioned());
|
|
|
|
rd >> hash >> version >> part;
|
|
|
|
version = bigEndian64(version);
|
|
|
|
part = bigEndian32(part);
|
|
|
|
|
|
|
|
int32_t v = version / CLIENT_KNOBS->LOG_RANGE_BLOCK_SIZE;
|
|
|
|
ASSERT(((uint8_t)hashlittle(&v, sizeof(v), 0)) == hash);
|
|
|
|
|
|
|
|
return std::make_pair(version, part);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Decodes an encoded list of mutations in the format of:
|
|
|
|
// [includeVersion:uint64_t][val_length:uint32_t][mutation_1][mutation_2]...[mutation_k],
|
|
|
|
// where a mutation is encoded as:
|
|
|
|
// [type:uint32_t][keyLength:uint32_t][valueLength:uint32_t][key][value]
|
2019-11-27 09:37:33 +08:00
|
|
|
std::vector<MutationRef> decode_value(const StringRef& value) {
|
2019-11-26 13:00:13 +08:00
|
|
|
StringRefReader reader(value, restore_corrupted_data());
|
|
|
|
|
|
|
|
reader.consume<uint64_t>(); // Consume the includeVersion
|
|
|
|
uint32_t val_length = reader.consume<uint32_t>();
|
Decode out of order mutations in old mutation logs
In the old mutation logs, a version's mutations are serialized as a buffer.
Then the buffer is split into smaller chunks, e.g., 10000 bytes each. When
writting chunks to the final mutation log file, these chunks can be flushed
out of order. For instance, the (version, chunck_part) can be in the order of
(3, 0), (4, 0), (3, 1). As a result, the decoder must read forward to find all
chunks of data for a version.
Another complication is that the files are organized into blocks, where (3, 1)
can be in a subsequent block. This change checks the value size for each
version, if the size is smaller than the right size, the decoder will look
for the missing chucks in the next block.
2020-03-11 06:45:57 +08:00
|
|
|
if (val_length != value.size() - sizeof(uint64_t) - sizeof(uint32_t)) {
|
2020-03-25 01:54:12 +08:00
|
|
|
TraceEvent(SevError, "ValueError")
|
Decode out of order mutations in old mutation logs
In the old mutation logs, a version's mutations are serialized as a buffer.
Then the buffer is split into smaller chunks, e.g., 10000 bytes each. When
writting chunks to the final mutation log file, these chunks can be flushed
out of order. For instance, the (version, chunck_part) can be in the order of
(3, 0), (4, 0), (3, 1). As a result, the decoder must read forward to find all
chunks of data for a version.
Another complication is that the files are organized into blocks, where (3, 1)
can be in a subsequent block. This change checks the value size for each
version, if the size is smaller than the right size, the decoder will look
for the missing chucks in the next block.
2020-03-11 06:45:57 +08:00
|
|
|
.detail("ValueLen", val_length)
|
|
|
|
.detail("ValueSize", value.size())
|
|
|
|
.detail("Value", printable(value));
|
|
|
|
}
|
2019-11-26 13:00:13 +08:00
|
|
|
|
2019-11-27 09:37:33 +08:00
|
|
|
std::vector<MutationRef> mutations;
|
2019-11-26 13:00:13 +08:00
|
|
|
while (1) {
|
|
|
|
if (reader.eof()) break;
|
|
|
|
|
|
|
|
// Deserialization of a MutationRef, which was packed by MutationListRef::push_back_deep()
|
|
|
|
uint32_t type, p1len, p2len;
|
|
|
|
type = reader.consume<uint32_t>();
|
|
|
|
p1len = reader.consume<uint32_t>();
|
|
|
|
p2len = reader.consume<uint32_t>();
|
|
|
|
|
|
|
|
const uint8_t* key = reader.consume(p1len);
|
|
|
|
const uint8_t* val = reader.consume(p2len);
|
|
|
|
|
2019-11-27 09:37:33 +08:00
|
|
|
mutations.emplace_back((MutationRef::Type)type, StringRef(key, p1len), StringRef(val, p2len));
|
2019-11-26 13:00:13 +08:00
|
|
|
}
|
2019-11-27 09:37:33 +08:00
|
|
|
return mutations;
|
2019-11-26 13:00:13 +08:00
|
|
|
}
|
|
|
|
|
2019-11-27 09:37:33 +08:00
|
|
|
struct VersionedMutations {
|
|
|
|
Version version;
|
|
|
|
std::vector<MutationRef> mutations;
|
2019-12-03 02:27:48 +08:00
|
|
|
Arena arena; // The arena that contains the mutations.
|
2019-11-27 09:37:33 +08:00
|
|
|
};
|
|
|
|
|
2020-12-26 02:55:42 +08:00
|
|
|
struct VersionedKVPart {
|
|
|
|
Arena arena;
|
|
|
|
Version version;
|
|
|
|
int32_t part;
|
|
|
|
StringRef kv;
|
|
|
|
VersionedKVPart(Arena arena, Version version, int32_t part, StringRef kv)
|
|
|
|
: arena(arena), version(version), part(part), kv(kv) {}
|
|
|
|
};
|
|
|
|
|
2019-11-27 09:37:33 +08:00
|
|
|
/*
|
|
|
|
* Model a decoding progress for a mutation file. Usage is:
|
|
|
|
*
|
|
|
|
* DecodeProgress progress(logfile);
|
|
|
|
* wait(progress->openFile(container));
|
|
|
|
* while (!progress->finished()) {
|
|
|
|
* VersionedMutations m = wait(progress->getNextBatch());
|
|
|
|
* ...
|
|
|
|
* }
|
2019-12-03 02:27:48 +08:00
|
|
|
*
|
|
|
|
* Internally, the decoding process is done block by block -- each block is
|
|
|
|
* decoded into a list of key/value pairs, which are then decoded into batches
|
|
|
|
* of mutations. Because a version's mutations can be split into many key/value
|
|
|
|
* pairs, the decoding of mutation batch needs to look ahead one more pair. So
|
|
|
|
* at any time this object might have two blocks of data in memory.
|
2019-11-27 09:37:33 +08:00
|
|
|
*/
|
2020-12-26 02:55:42 +08:00
|
|
|
class DecodeProgress {
|
|
|
|
std::vector<VersionedKVPart> keyValues;
|
|
|
|
|
|
|
|
public:
|
2019-11-26 13:00:13 +08:00
|
|
|
// Constructs an empty progress object with no file and no buffered parts.
DecodeProgress() = default;
|
2020-05-23 00:25:32 +08:00
|
|
|
template <class U>
|
|
|
|
DecodeProgress(const LogFile& file, U &&values)
|
|
|
|
: file(file), keyValues(std::forward<U>(values)) {}
|
2019-11-27 09:37:33 +08:00
|
|
|
|
2020-03-31 02:34:51 +08:00
|
|
|
// If there are no more mutations to pull from the file.
|
|
|
|
// However, we could have unfinished version in the buffer when EOF is true,
|
2020-03-31 04:37:29 +08:00
|
|
|
// which means we should look for data in the next file. The caller
|
|
|
|
// should call getUnfinishedBuffer() to get these left data.
|
2020-12-28 12:43:47 +08:00
|
|
|
// With buffered parts pending, we are done only when an unfinished version was
// detected (leftover); with an empty buffer, done only once the file hit EOF.
bool finished() const { return keyValues.empty() ? eof : leftover; }
|
2020-03-31 02:34:51 +08:00
|
|
|
|
2020-12-26 02:55:42 +08:00
|
|
|
// Moves out the buffered key/value parts of an unfinished version so the
// caller can seed the DecodeProgress for the next file (rvalue-only).
std::vector<VersionedKVPart>&& getUnfinishedBuffer() && { return std::move(keyValues); }
|
2019-11-27 09:37:33 +08:00
|
|
|
|
|
|
|
// Returns all mutations of the next version in a batch.
|
|
|
|
// Delegates to the getNextBatchImpl actor with this object as its state.
Future<VersionedMutations> getNextBatch() { return getNextBatchImpl(this); }
|
|
|
|
|
|
|
|
// Opens this progress's log file within "container" (see openFileImpl).
Future<Void> openFile(Reference<IBackupContainer> container) { return openFileImpl(this, container); }
|
|
|
|
|
|
|
|
// The following are private APIs:
|
|
|
|
|
Decode out of order mutations in old mutation logs
In the old mutation logs, a version's mutations are serialized as a buffer.
Then the buffer is split into smaller chunks, e.g., 10000 bytes each. When
writting chunks to the final mutation log file, these chunks can be flushed
out of order. For instance, the (version, chunck_part) can be in the order of
(3, 0), (4, 0), (3, 1). As a result, the decoder must read forward to find all
chunks of data for a version.
Another complication is that the files are organized into blocks, where (3, 1)
can be in a subsequent block. This change checks the value size for each
version, if the size is smaller than the right size, the decoder will look
for the missing chucks in the next block.
2020-03-11 06:45:57 +08:00
|
|
|
// Returns true if value contains complete data.
|
2020-12-26 02:55:42 +08:00
|
|
|
static bool isValueComplete(StringRef value) {
|
Decode out of order mutations in old mutation logs
In the old mutation logs, a version's mutations are serialized as a buffer.
Then the buffer is split into smaller chunks, e.g., 10000 bytes each. When
writting chunks to the final mutation log file, these chunks can be flushed
out of order. For instance, the (version, chunck_part) can be in the order of
(3, 0), (4, 0), (3, 1). As a result, the decoder must read forward to find all
chunks of data for a version.
Another complication is that the files are organized into blocks, where (3, 1)
can be in a subsequent block. This change checks the value size for each
version, if the size is smaller than the right size, the decoder will look
for the missing chucks in the next block.
2020-03-11 06:45:57 +08:00
|
|
|
StringRefReader reader(value, restore_corrupted_data());
|
|
|
|
|
|
|
|
reader.consume<uint64_t>(); // Consume the includeVersion
|
|
|
|
uint32_t val_length = reader.consume<uint32_t>();
|
|
|
|
return val_length == value.size() - sizeof(uint64_t) - sizeof(uint32_t);
|
|
|
|
}
|
|
|
|
|
2019-12-03 02:27:48 +08:00
|
|
|
// PRECONDITION: finished() must return false before calling this function.
|
|
|
|
// Returns the next batch of mutations along with the arena backing it.
|
2020-03-31 02:34:51 +08:00
|
|
|
// Note the returned batch can be empty when the file has unfinished
|
|
|
|
// version batch data that are in the next file.
|
2019-11-27 09:37:33 +08:00
|
|
|
ACTOR static Future<VersionedMutations> getNextBatchImpl(DecodeProgress* self) {
|
|
|
|
ASSERT(!self->finished());
|
|
|
|
|
|
|
|
loop {
|
2020-03-31 04:37:29 +08:00
|
|
|
if (self->keyValues.size() <= 1) {
|
|
|
|
// Try to decode another block when less than one left
|
2020-03-14 09:44:15 +08:00
|
|
|
wait(readAndDecodeFile(self));
|
|
|
|
}
|
Decode out of order mutations in old mutation logs
In the old mutation logs, a version's mutations are serialized as a buffer.
Then the buffer is split into smaller chunks, e.g., 10000 bytes each. When
writting chunks to the final mutation log file, these chunks can be flushed
out of order. For instance, the (version, chunck_part) can be in the order of
(3, 0), (4, 0), (3, 1). As a result, the decoder must read forward to find all
chunks of data for a version.
Another complication is that the files are organized into blocks, where (3, 1)
can be in a subsequent block. This change checks the value size for each
version, if the size is smaller than the right size, the decoder will look
for the missing chucks in the next block.
2020-03-11 06:45:57 +08:00
|
|
|
|
2020-12-26 02:55:42 +08:00
|
|
|
const auto& kv = self->keyValues[0];
|
|
|
|
ASSERT(kv.part == 0);
|
Decode out of order mutations in old mutation logs
In the old mutation logs, a version's mutations are serialized as a buffer.
Then the buffer is split into smaller chunks, e.g., 10000 bytes each. When
writting chunks to the final mutation log file, these chunks can be flushed
out of order. For instance, the (version, chunck_part) can be in the order of
(3, 0), (4, 0), (3, 1). As a result, the decoder must read forward to find all
chunks of data for a version.
Another complication is that the files are organized into blocks, where (3, 1)
can be in a subsequent block. This change checks the value size for each
version, if the size is smaller than the right size, the decoder will look
for the missing chucks in the next block.
2020-03-11 06:45:57 +08:00
|
|
|
|
|
|
|
// decode next versions, check if they are continuous parts
|
2020-03-14 09:44:15 +08:00
|
|
|
int idx = 1; // next kv pair in "keyValues"
|
2020-12-26 02:55:42 +08:00
|
|
|
int bufSize = kv.kv.size();
|
2020-03-14 09:44:15 +08:00
|
|
|
for (int lastPart = 0; idx < self->keyValues.size(); idx++, lastPart++) {
|
Decode out of order mutations in old mutation logs
In the old mutation logs, a version's mutations are serialized as a buffer.
Then the buffer is split into smaller chunks, e.g., 10000 bytes each. When
writting chunks to the final mutation log file, these chunks can be flushed
out of order. For instance, the (version, chunck_part) can be in the order of
(3, 0), (4, 0), (3, 1). As a result, the decoder must read forward to find all
chunks of data for a version.
Another complication is that the files are organized into blocks, where (3, 1)
can be in a subsequent block. This change checks the value size for each
version, if the size is smaller than the right size, the decoder will look
for the missing chucks in the next block.
2020-03-11 06:45:57 +08:00
|
|
|
if (idx == self->keyValues.size()) break;
|
|
|
|
|
2020-12-26 02:55:42 +08:00
|
|
|
const auto& nextKV = self->keyValues[idx];
|
|
|
|
if (kv.version != nextKV.version) {
|
Decode out of order mutations in old mutation logs
In the old mutation logs, a version's mutations are serialized as a buffer.
Then the buffer is split into smaller chunks, e.g., 10000 bytes each. When
writting chunks to the final mutation log file, these chunks can be flushed
out of order. For instance, the (version, chunck_part) can be in the order of
(3, 0), (4, 0), (3, 1). As a result, the decoder must read forward to find all
chunks of data for a version.
Another complication is that the files are organized into blocks, where (3, 1)
can be in a subsequent block. This change checks the value size for each
version, if the size is smaller than the right size, the decoder will look
for the missing chucks in the next block.
2020-03-11 06:45:57 +08:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2020-12-26 02:55:42 +08:00
|
|
|
if (lastPart + 1 != nextKV.part) {
|
|
|
|
TraceEvent("DecodeError").detail("Part1", lastPart).detail("Part2", nextKV.part);
|
Decode out of order mutations in old mutation logs
In the old mutation logs, a version's mutations are serialized as a buffer.
Then the buffer is split into smaller chunks, e.g., 10000 bytes each. When
writting chunks to the final mutation log file, these chunks can be flushed
out of order. For instance, the (version, chunck_part) can be in the order of
(3, 0), (4, 0), (3, 1). As a result, the decoder must read forward to find all
chunks of data for a version.
Another complication is that the files are organized into blocks, where (3, 1)
can be in a subsequent block. This change checks the value size for each
version, if the size is smaller than the right size, the decoder will look
for the missing chucks in the next block.
2020-03-11 06:45:57 +08:00
|
|
|
throw restore_corrupted_data();
|
|
|
|
}
|
2020-12-26 02:55:42 +08:00
|
|
|
bufSize += nextKV.kv.size();
|
2019-11-27 09:37:33 +08:00
|
|
|
}
|
|
|
|
|
Decode out of order mutations in old mutation logs
In the old mutation logs, a version's mutations are serialized as a buffer.
Then the buffer is split into smaller chunks, e.g., 10000 bytes each. When
writting chunks to the final mutation log file, these chunks can be flushed
out of order. For instance, the (version, chunck_part) can be in the order of
(3, 0), (4, 0), (3, 1). As a result, the decoder must read forward to find all
chunks of data for a version.
Another complication is that the files are organized into blocks, where (3, 1)
can be in a subsequent block. This change checks the value size for each
version, if the size is smaller than the right size, the decoder will look
for the missing chucks in the next block.
2020-03-11 06:45:57 +08:00
|
|
|
VersionedMutations m;
|
2020-12-26 02:55:42 +08:00
|
|
|
m.version = kv.version;
|
Decode out of order mutations in old mutation logs
In the old mutation logs, a version's mutations are serialized as a buffer.
Then the buffer is split into smaller chunks, e.g., 10000 bytes each. When
writting chunks to the final mutation log file, these chunks can be flushed
out of order. For instance, the (version, chunck_part) can be in the order of
(3, 0), (4, 0), (3, 1). As a result, the decoder must read forward to find all
chunks of data for a version.
Another complication is that the files are organized into blocks, where (3, 1)
can be in a subsequent block. This change checks the value size for each
version, if the size is smaller than the right size, the decoder will look
for the missing chucks in the next block.
2020-03-11 06:45:57 +08:00
|
|
|
TraceEvent("Decode").detail("Version", m.version).detail("Idx", idx).detail("Q", self->keyValues.size());
|
2020-12-26 02:55:42 +08:00
|
|
|
StringRef value = kv.kv;
|
Decode out of order mutations in old mutation logs
In the old mutation logs, a version's mutations are serialized as a buffer.
Then the buffer is split into smaller chunks, e.g., 10000 bytes each. When
writting chunks to the final mutation log file, these chunks can be flushed
out of order. For instance, the (version, chunck_part) can be in the order of
(3, 0), (4, 0), (3, 1). As a result, the decoder must read forward to find all
chunks of data for a version.
Another complication is that the files are organized into blocks, where (3, 1)
can be in a subsequent block. This change checks the value size for each
version, if the size is smaller than the right size, the decoder will look
for the missing chucks in the next block.
2020-03-11 06:45:57 +08:00
|
|
|
if (idx > 1) {
|
|
|
|
// Stitch parts into one and then decode one by one
|
|
|
|
Standalone<StringRef> buf = self->combineValues(idx, bufSize);
|
|
|
|
value = buf;
|
|
|
|
m.arena = buf.arena();
|
|
|
|
}
|
2020-12-26 02:55:42 +08:00
|
|
|
if (isValueComplete(value)) {
|
Decode out of order mutations in old mutation logs
In the old mutation logs, a version's mutations are serialized as a buffer.
Then the buffer is split into smaller chunks, e.g., 10000 bytes each. When
writting chunks to the final mutation log file, these chunks can be flushed
out of order. For instance, the (version, chunck_part) can be in the order of
(3, 0), (4, 0), (3, 1). As a result, the decoder must read forward to find all
chunks of data for a version.
Another complication is that the files are organized into blocks, where (3, 1)
can be in a subsequent block. This change checks the value size for each
version, if the size is smaller than the right size, the decoder will look
for the missing chucks in the next block.
2020-03-11 06:45:57 +08:00
|
|
|
m.mutations = decode_value(value);
|
2020-03-31 04:37:29 +08:00
|
|
|
if (m.arena.getSize() == 0) {
|
2020-12-26 02:55:42 +08:00
|
|
|
m.arena = kv.arena;
|
2020-03-31 04:37:29 +08:00
|
|
|
}
|
Decode out of order mutations in old mutation logs
In the old mutation logs, a version's mutations are serialized as a buffer.
Then the buffer is split into smaller chunks, e.g., 10000 bytes each. When
writting chunks to the final mutation log file, these chunks can be flushed
out of order. For instance, the (version, chunck_part) can be in the order of
(3, 0), (4, 0), (3, 1). As a result, the decoder must read forward to find all
chunks of data for a version.
Another complication is that the files are organized into blocks, where (3, 1)
can be in a subsequent block. This change checks the value size for each
version, if the size is smaller than the right size, the decoder will look
for the missing chucks in the next block.
2020-03-11 06:45:57 +08:00
|
|
|
self->keyValues.erase(self->keyValues.begin(), self->keyValues.begin() + idx);
|
|
|
|
return m;
|
|
|
|
} else if (!self->eof) {
|
|
|
|
// Read one more block, hopefully the missing part of the value can be found.
|
|
|
|
wait(readAndDecodeFile(self));
|
|
|
|
} else {
|
2020-03-31 02:34:51 +08:00
|
|
|
TraceEvent(SevWarn, "MissingValue").detail("Version", m.version);
|
2020-03-31 04:37:29 +08:00
|
|
|
self->leftover = true;
|
2020-03-31 02:34:51 +08:00
|
|
|
return m; // Empty mutations
|
2019-11-27 09:37:33 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Returns a buffer which stitches first "idx" values into one.
|
2019-12-03 02:27:48 +08:00
|
|
|
// "len" MUST equal the summation of these values.
|
2019-11-27 09:37:33 +08:00
|
|
|
Standalone<StringRef> combineValues(const int idx, const int len) {
|
|
|
|
ASSERT(idx <= keyValues.size() && idx > 1);
|
|
|
|
|
|
|
|
Standalone<StringRef> buf = makeString(len);
|
|
|
|
int n = 0;
|
|
|
|
for (int i = 0; i < idx; i++) {
|
2020-12-26 02:55:42 +08:00
|
|
|
const auto& value = keyValues[i].kv;
|
2019-11-27 09:37:33 +08:00
|
|
|
memcpy(mutateString(buf) + n, value.begin(), value.size());
|
|
|
|
n += value.size();
|
|
|
|
}
|
|
|
|
|
|
|
|
ASSERT(n == len);
|
|
|
|
return buf;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Decodes a block into KeyValueRef stored in "keyValues".
|
2019-11-26 13:00:13 +08:00
|
|
|
void decode_block(const Standalone<StringRef>& buf, int len) {
|
|
|
|
StringRef block(buf.begin(), len);
|
|
|
|
StringRefReader reader(block, restore_corrupted_data());
|
|
|
|
|
|
|
|
try {
|
2020-02-15 03:27:02 +08:00
|
|
|
// Read header, currently only decoding version BACKUP_AGENT_MLOG_VERSION
|
2019-11-26 13:00:13 +08:00
|
|
|
if (reader.consume<int32_t>() != BACKUP_AGENT_MLOG_VERSION) throw restore_unsupported_file_version();
|
|
|
|
|
|
|
|
// Read k/v pairs. Block ends either at end of last value exactly or with 0xFF as first key len byte.
|
|
|
|
while (1) {
|
|
|
|
// If eof reached or first key len bytes is 0xFF then end of block was reached.
|
|
|
|
if (reader.eof() || *reader.rptr == 0xFF) break;
|
|
|
|
|
|
|
|
// Read key and value. If anything throws then there is a problem.
|
|
|
|
uint32_t kLen = reader.consumeNetworkUInt32();
|
|
|
|
const uint8_t* k = reader.consume(kLen);
|
Decode out of order mutations in old mutation logs
In the old mutation logs, a version's mutations are serialized as a buffer.
Then the buffer is split into smaller chunks, e.g., 10000 bytes each. When
writting chunks to the final mutation log file, these chunks can be flushed
out of order. For instance, the (version, chunck_part) can be in the order of
(3, 0), (4, 0), (3, 1). As a result, the decoder must read forward to find all
chunks of data for a version.
Another complication is that the files are organized into blocks, where (3, 1)
can be in a subsequent block. This change checks the value size for each
version, if the size is smaller than the right size, the decoder will look
for the missing chucks in the next block.
2020-03-11 06:45:57 +08:00
|
|
|
std::pair<Version, int32_t> version_part = decode_key(StringRef(k, kLen));
|
2019-11-26 13:00:13 +08:00
|
|
|
uint32_t vLen = reader.consumeNetworkUInt32();
|
|
|
|
const uint8_t* v = reader.consume(vLen);
|
2020-03-25 01:54:12 +08:00
|
|
|
TraceEvent(SevDecodeInfo, "Block")
|
Decode out of order mutations in old mutation logs
In the old mutation logs, a version's mutations are serialized as a buffer.
Then the buffer is split into smaller chunks, e.g., 10000 bytes each. When
writting chunks to the final mutation log file, these chunks can be flushed
out of order. For instance, the (version, chunck_part) can be in the order of
(3, 0), (4, 0), (3, 1). As a result, the decoder must read forward to find all
chunks of data for a version.
Another complication is that the files are organized into blocks, where (3, 1)
can be in a subsequent block. This change checks the value size for each
version, if the size is smaller than the right size, the decoder will look
for the missing chucks in the next block.
2020-03-11 06:45:57 +08:00
|
|
|
.detail("KeySize", kLen)
|
|
|
|
.detail("valueSize", vLen)
|
|
|
|
.detail("Offset", reader.rptr - buf.begin())
|
|
|
|
.detail("Version", version_part.first)
|
|
|
|
.detail("Part", version_part.second);
|
|
|
|
keyValues.emplace_back(buf.arena(), version_part.first, version_part.second, StringRef(v, vLen));
|
2019-11-26 13:00:13 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
// Make sure any remaining bytes in the block are 0xFF
|
|
|
|
for (auto b : reader.remainder()) {
|
|
|
|
if (b != 0xFF) throw restore_corrupted_data_padding();
|
|
|
|
}
|
|
|
|
|
Decode out of order mutations in old mutation logs
In the old mutation logs, a version's mutations are serialized as a buffer.
Then the buffer is split into smaller chunks, e.g., 10000 bytes each. When
writting chunks to the final mutation log file, these chunks can be flushed
out of order. For instance, the (version, chunck_part) can be in the order of
(3, 0), (4, 0), (3, 1). As a result, the decoder must read forward to find all
chunks of data for a version.
Another complication is that the files are organized into blocks, where (3, 1)
can be in a subsequent block. This change checks the value size for each
version, if the size is smaller than the right size, the decoder will look
for the missing chucks in the next block.
2020-03-11 06:45:57 +08:00
|
|
|
// The (version, part) in a block can be out of order, i.e., (3, 0)
|
|
|
|
// can be followed by (4, 0), and then (3, 1). So we need to sort them
|
|
|
|
// first by version, and then by part number.
|
2020-12-26 02:55:42 +08:00
|
|
|
std::sort(keyValues.begin(), keyValues.end(), [](const VersionedKVPart& a, const VersionedKVPart& b) {
|
|
|
|
return a.version == b.version ? a.part < b.part : a.version < b.version;
|
|
|
|
});
|
2019-11-26 13:00:13 +08:00
|
|
|
return;
|
|
|
|
} catch (Error& e) {
|
|
|
|
TraceEvent(SevWarn, "CorruptBlock").error(e).detail("Offset", reader.rptr - buf.begin());
|
|
|
|
throw;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-11-27 09:37:33 +08:00
|
|
|
// Opens the log file named by self->file from the backup container and
// decodes its first block, so that data is available for getNextBatch().
// The file handle is stored on "self" for subsequent block reads.
ACTOR static Future<Void> openFileImpl(DecodeProgress* self, Reference<IBackupContainer> container) {
	Reference<IAsyncFile> fd = wait(container->readFile(self->file.fileName));
	self->fd = fd;
	// Prime the decoder with the first block of the file.
	wait(readAndDecodeFile(self));
	return Void();
}
|
2019-11-26 13:00:13 +08:00
|
|
|
|
2019-12-03 02:27:48 +08:00
|
|
|
// Reads a file block, decodes it into key/value pairs, and stores these pairs.
// Reads one block-sized chunk at self->offset; sets self->eof when the file is
// exhausted. Throws restore_corrupted_data on a short read, and rethrows any
// decode error after logging the block's location.
ACTOR static Future<Void> readAndDecodeFile(DecodeProgress* self) {
	try {
		// A block is at most file.blockSize; the last block may be shorter.
		state int64_t len = std::min<int64_t>(self->file.blockSize, self->file.fileSize - self->offset);
		if (len == 0) {
			// No bytes left: mark end-of-file and stop.
			self->eof = true;
			return Void();
		}

		state Standalone<StringRef> buf = makeString(len);
		state int rLen = wait(self->fd->read(mutateString(buf), len, self->offset));
		TraceEvent("ReadFile")
		    .detail("Name", self->file.fileName)
		    .detail("Len", rLen)
		    .detail("Offset", self->offset);
		if (rLen != len) {
			// A partial block read means the file is truncated or corrupt.
			throw restore_corrupted_data();
		}
		// Parse the block into (version, part, value) entries for later batching.
		self->decode_block(buf, rLen);
		self->offset += rLen;
		return Void();
	} catch (Error& e) {
		TraceEvent(SevWarn, "CorruptLogFileBlock")
		    .error(e)
		    .detail("Filename", self->file.fileName)
		    .detail("BlockOffset", self->offset)
		    .detail("BlockLen", self->file.blockSize);
		throw;
	}
}
|
|
|
|
|
|
|
|
	LogFile file; // Metadata of the log file being decoded (name, sizes, block size).
	Reference<IAsyncFile> fd; // Open handle to the file; set by openFileImpl().
	int64_t offset = 0; // Byte offset of the next block to read from the file.
	bool eof = false; // True once all bytes of the file have been consumed.
	bool leftover = false; // Done but has unfinished version batch data left
};
|
|
|
|
|
|
|
|
// Decodes all relevant (non-partitioned) mutation log files in the backup
// container named by params.container_url, printing each version's mutations
// to stdout. Mutation data for a version can span file boundaries, so the
// unfinished tail of one file is carried into the next file's decoder.
ACTOR Future<Void> decode_logs(DecodeParams params) {
	state Reference<IBackupContainer> container = IBackupContainer::openContainer(params.container_url);

	state BackupFileList listing = wait(container->dumpFileList());
	// remove partitioned logs
	listing.logs.erase(std::remove_if(listing.logs.begin(), listing.logs.end(),
	                                  [](const LogFile& file) {
		                                  std::string prefix("plogs/");
		                                  return file.fileName.substr(0, prefix.size()) == prefix;
	                                  }),
	                   listing.logs.end());
	std::sort(listing.logs.begin(), listing.logs.end());
	TraceEvent("Container").detail("URL", params.container_url).detail("Logs", listing.logs.size());

	BackupDescription desc = wait(container->describeBackup());
	std::cout << "\n" << desc.toString() << "\n";

	state std::vector<LogFile> logs = getRelevantLogFiles(listing.logs, params);
	printLogFiles("Relevant files are: ", logs);

	state int i = 0;
	// Previous file's unfinished version data
	state std::vector<VersionedKVPart> left;
	for (; i < logs.size(); i++) {
		if (logs[i].fileSize == 0) continue;

		// Seed this file's decoder with the unfinished data left by the
		// previous file, then drain it batch by batch.
		state DecodeProgress progress(logs[i], std::move(left));
		wait(progress.openFile(container));
		while (!progress.finished()) {
			VersionedMutations vms = wait(progress.getNextBatch());
			for (const auto& m : vms.mutations) {
				std::cout << vms.version << " " << m.toString() << "\n";
			}
		}
		// Carry any incomplete version batch into the next iteration.
		left = std::move(progress).getUnfinishedBuffer();
		if (!left.empty()) {
			TraceEvent("UnfinishedFile").detail("File", logs[i].fileName).detail("Q", left.size());
		}
	}
	return Void();
}
|
|
|
|
|
|
|
|
} // namespace file_converter
|
|
|
|
|
|
|
|
int main(int argc, char** argv) {
|
|
|
|
try {
|
|
|
|
CSimpleOpt* args = new CSimpleOpt(argc, argv, file_converter::gConverterOptions, SO_O_EXACT);
|
|
|
|
file_converter::DecodeParams param;
|
|
|
|
int status = file_converter::parseDecodeCommandLine(¶m, args);
|
|
|
|
std::cout << "Params: " << param.toString() << "\n";
|
|
|
|
if (status != FDB_EXIT_SUCCESS) {
|
|
|
|
file_converter::printDecodeUsage();
|
|
|
|
return status;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (param.log_enabled) {
|
|
|
|
if (param.log_dir.empty()) {
|
|
|
|
setNetworkOption(FDBNetworkOptions::TRACE_ENABLE);
|
|
|
|
} else {
|
|
|
|
setNetworkOption(FDBNetworkOptions::TRACE_ENABLE, StringRef(param.log_dir));
|
|
|
|
}
|
|
|
|
if (!param.trace_format.empty()) {
|
|
|
|
setNetworkOption(FDBNetworkOptions::TRACE_FORMAT, StringRef(param.trace_format));
|
|
|
|
}
|
|
|
|
if (!param.trace_log_group.empty()) {
|
|
|
|
setNetworkOption(FDBNetworkOptions::TRACE_LOG_GROUP, StringRef(param.trace_log_group));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
platformInit();
|
|
|
|
Error::init();
|
|
|
|
|
|
|
|
StringRef url(param.container_url);
|
|
|
|
setupNetwork(0, true);
|
|
|
|
|
|
|
|
TraceEvent::setNetworkThread();
|
|
|
|
openTraceFile(NetworkAddress(), 10 << 20, 10 << 20, param.log_dir, "decode", param.trace_log_group);
|
|
|
|
|
|
|
|
auto f = stopAfter(decode_logs(param));
|
|
|
|
|
|
|
|
runNetwork();
|
|
|
|
return status;
|
|
|
|
} catch (Error& e) {
|
|
|
|
fprintf(stderr, "ERROR: %s\n", e.what());
|
|
|
|
return FDB_EXIT_ERROR;
|
|
|
|
} catch (std::exception& e) {
|
|
|
|
TraceEvent(SevError, "MainError").error(unknown_error()).detail("RootException", e.what());
|
|
|
|
return FDB_EXIT_MAIN_EXCEPTION;
|
|
|
|
}
|
2020-02-04 02:42:05 +08:00
|
|
|
}
|