Filter partitioned logs with subset relationship

If a log file's progress is not saved, a new log file will be generated
with the same begin version. As a result, one file can contain a subset
of the contents of another log file. During restore, we should filter out
files whose contents are a subset of another file's contents.
This commit is contained in:
Jingyu Zhou 2020-03-04 10:52:51 -08:00
parent 696ce6aa82
commit 20df67ee6a
3 changed files with 13 additions and 9 deletions

View File

@ -81,10 +81,10 @@ std::vector<LogFile> getRelevantLogFiles(const std::vector<LogFile>& files, Vers
std::vector<LogFile> sorted;
int i = 0;
for (int j = 1; j < filtered.size(); j++) {
if (!filtered[i].sameContent(filtered[j])) {
if (!filtered[i].isSubset(filtered[j])) {
sorted.push_back(filtered[i]);
i = j;
}
i = j;
}
if (i < filtered.size()) {
sorted.push_back(filtered[i]);

View File

@ -1155,16 +1155,19 @@ std::cout << "lastBegin " << lastBegin << ", lastEnd " << lastEnd << ", end " <<
return true;
}
// Returns log files that are not duplicated.
// Returns log files that are neither duplicates nor subsets of another log.
// If a log file's progress is not saved, a new log file will be generated
// with the same begin version. So we can have a file that contains a subset
// of the contents of another log file.
// PRE-CONDITION: logs are already sorted.
static std::vector<LogFile> filterDuplicates(const std::vector<LogFile>& logs) {
std::vector<LogFile> filtered;
int i = 0;
for (int j = 1; j < logs.size(); j++) {
if (!logs[i].sameContent(logs[j])) {
if (!logs[i].isSubset(logs[j])) {
filtered.push_back(logs[i]);
i = j;
}
i = j;
}
if (i < logs.size()) filtered.push_back(logs[i]);
return filtered;
@ -1180,7 +1183,7 @@ std::cout << "lastBegin " << lastBegin << ", lastEnd " << lastEnd << ", end " <<
Version end = getPartitionedLogsContinuousEndVersion(logs, file.beginVersion);
std::cout << " determine " << file.toString() << " , end " << end << "\n\n";
if (end > file.beginVersion) {
desc->minLogBegin = file.beginVersion;
// desc->minLogBegin = file.beginVersion;
// contiguousLogEnd is not inclusive, so +1 here.
desc->contiguousLogEnd.get() = end + 1;
return;

View File

@ -82,9 +82,10 @@ struct LogFile {
return beginVersion == rhs.beginVersion ? endVersion < rhs.endVersion : beginVersion < rhs.beginVersion;
}
// Returns if two log files have the same content by comparing version range and tag ID.
bool sameContent(const LogFile& rhs) const {
return beginVersion == rhs.beginVersion && endVersion == rhs.endVersion && tagId == rhs.tagId;
// Returns true if the content of this log file is a subset of the content of
// the given file, determined by comparing version range and tag ID.
bool isSubset(const LogFile& rhs) const {
return beginVersion == rhs.beginVersion && endVersion <= rhs.endVersion && tagId == rhs.tagId;
}
std::string toString() const {