Tightening up memory management in the blob worker

Josh Slocum 2022-02-01 14:52:28 -06:00
parent d0e89ecdd5
commit a42c80faa9
1 changed file with 34 additions and 30 deletions
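
In short, the change replaces the separate Arena members (GranuleMetadata::deltaArena and the local arena in writeSnapshot) with Standalone<GranuleDeltas> / Standalone<GranuleSnapshot>, so the buffered data and the arena backing it are owned by a single object, and it drops serialized buffers and in-memory containers as soon as they are no longer needed. The sketch below illustrates that ownership pattern with hypothetical, simplified stand-ins for FDB's Arena and Standalone<T> (the real classes live in flow/Arena.h); it is not the worker's actual code.

// Hypothetical, simplified stand-ins for FDB's Arena and Standalone<T>, used only
// to illustrate the ownership pattern in this commit; not the real flow/Arena.h types.
#include <cstdio>
#include <memory>
#include <vector>

struct Arena {
    // Backing allocation; shared so dependsOn() can keep another arena's memory alive.
    std::shared_ptr<std::vector<char>> block = std::make_shared<std::vector<char>>();
    std::vector<std::shared_ptr<std::vector<char>>> held;
    void dependsOn(const Arena& other) { held.push_back(other.block); }
};

// A value of type T bundled with the arena that backs it, so both live and die together.
template <class T>
struct Standalone : T {
    Arena& arena() { return arena_; }

private:
    Arena arena_;
};

struct GranuleDeltas {
    std::vector<int> mutations; // simplified payload
};

int main() {
    Standalone<GranuleDeltas> currentDeltas;
    currentDeltas.mutations = {1, 2, 3};

    // The pattern the diff uses after serializing: assigning a default-constructed
    // Standalone drops the payload and its backing arena in one step, instead of
    // resetting a separate Arena member and the container independently.
    currentDeltas = Standalone<GranuleDeltas>();
    std::printf("buffered mutations after reset: %zu\n", currentDeltas.mutations.size());
    return 0;
}

The read path keeps working unchanged: rep.arena.dependsOn(metadata->currentDeltas.arena()) keeps the buffered mutations alive for the reply just as rep.arena.dependsOn(metadata->deltaArena) did before.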


@@ -64,9 +64,7 @@ struct GranuleMetadata : NonCopyable, ReferenceCounted<GranuleMetadata> {
KeyRange keyRange;
GranuleFiles files;
- GranuleDeltas currentDeltas; // only contain deltas in pendingDeltaVersion + 1, bufferedDeltaVersion
- // TODO get rid of this and do Reference<Standalone<GranuleDeltas>>?
- Arena deltaArena;
+ Standalone<GranuleDeltas> currentDeltas; // only contain deltas in pendingDeltaVersion + 1, bufferedDeltaVersion
uint64_t bytesInNewDeltaFiles = 0;
uint64_t bufferedDeltaBytes = 0;
@@ -431,8 +429,7 @@ ACTOR Future<BlobFileIndex> writeDeltaFile(Reference<BlobWorkerData> bwData,
UID granuleID,
int64_t epoch,
int64_t seqno,
- Arena deltaArena,
- GranuleDeltas deltasToWrite,
+ Standalone<GranuleDeltas> deltasToWrite,
Version currentDeltaVersion,
Future<BlobFileIndex> previousDeltaFileFuture,
Future<Void> waitCommitted,
@@ -446,18 +443,23 @@ ACTOR Future<BlobFileIndex> writeDeltaFile(Reference<BlobWorkerData> bwData,
std::to_string(currentDeltaVersion) + ".delta";
state Value serialized = ObjectWriter::toValue(deltasToWrite, Unversioned());
+ state size_t serializedSize = serialized.size();
- // FIXME: technically we can free up deltaArena here to reduce memory
+ // Free up deltasToWrite here to reduce memory
+ deltasToWrite = Standalone<GranuleDeltas>();
state Reference<IBackupFile> objectFile = wait(bwData->bstore->writeFile(fname));
++bwData->stats.s3PutReqs;
++bwData->stats.deltaFilesWritten;
- bwData->stats.deltaBytesWritten += serialized.size();
+ bwData->stats.deltaBytesWritten += serializedSize;
- wait(objectFile->append(serialized.begin(), serialized.size()));
+ wait(objectFile->append(serialized.begin(), serializedSize));
wait(objectFile->finish());
+ // free serialized since it is persisted in blob
+ serialized = Value();
state int numIterations = 0;
try {
// before updating FDB, wait for the delta file version to be committed and previous delta files to finish
@@ -474,7 +476,7 @@ ACTOR Future<BlobFileIndex> writeDeltaFile(Reference<BlobWorkerData> bwData,
wait(readAndCheckGranuleLock(tr, keyRange, epoch, seqno));
Key dfKey = blobGranuleFileKeyFor(granuleID, 'D', currentDeltaVersion);
- Value dfValue = blobGranuleFileValueFor(fname, 0, serialized.size());
+ Value dfValue = blobGranuleFileValueFor(fname, 0, serializedSize);
tr->set(dfKey, dfValue);
if (oldGranuleComplete.present()) {
@@ -493,7 +495,7 @@ ACTOR Future<BlobFileIndex> writeDeltaFile(Reference<BlobWorkerData> bwData,
keyRange.begin.printable(),
keyRange.end.printable(),
fname,
- serialized.size(),
+ serializedSize,
currentDeltaVersion,
tr->getCommittedVersion());
}
@@ -501,7 +503,7 @@ ACTOR Future<BlobFileIndex> writeDeltaFile(Reference<BlobWorkerData> bwData,
if (BUGGIFY_WITH_PROB(0.01)) {
wait(delay(deterministicRandom()->random01()));
}
- return BlobFileIndex(currentDeltaVersion, fname, 0, serialized.size());
+ return BlobFileIndex(currentDeltaVersion, fname, 0, serializedSize);
} catch (Error& e) {
numIterations++;
wait(tr->onError(e));
@@ -545,16 +547,15 @@ ACTOR Future<BlobFileIndex> writeSnapshot(Reference<BlobWorkerData> bwData,
state std::string fname = deterministicRandom()->randomUniqueID().shortString() + "_" + granuleID.toString() +
"_T" + std::to_string((uint64_t)(1000.0 * now())) + "_V" + std::to_string(version) +
".snapshot";
- state Arena arena;
- state GranuleSnapshot snapshot;
+ state Standalone<GranuleSnapshot> snapshot;
wait(delay(0, TaskPriority::BlobWorkerUpdateStorage));
loop {
try {
RangeResult res = waitNext(rows.getFuture());
- arena.dependsOn(res.arena());
- snapshot.append(arena, res.begin(), res.size());
+ snapshot.arena().dependsOn(res.arena());
+ snapshot.append(snapshot.arena(), res.begin(), res.size());
wait(yield(TaskPriority::BlobWorkerUpdateStorage));
} catch (Error& e) {
if (e.code() == error_code_end_of_stream) {
@@ -587,20 +588,26 @@ ACTOR Future<BlobFileIndex> writeSnapshot(Reference<BlobWorkerData> bwData,
ASSERT(snapshot[i].key < snapshot[i + 1].key);
}
// TODO is this easy to read as a flatbuffer from reader? Need to be sure about this data format
state Value serialized = ObjectWriter::toValue(snapshot, Unversioned());
+ state size_t serializedSize = serialized.size();
- // write to s3 using multi part upload
+ // free snapshot to reduce memory
+ snapshot = Standalone<GranuleSnapshot>();
+ // write to blob using multi part upload
state Reference<IBackupFile> objectFile = wait(bwData->bstore->writeFile(fname));
++bwData->stats.s3PutReqs;
++bwData->stats.snapshotFilesWritten;
- bwData->stats.snapshotBytesWritten += serialized.size();
+ bwData->stats.snapshotBytesWritten += serializedSize;
// TODO: inject write error
- wait(objectFile->append(serialized.begin(), serialized.size()));
+ wait(objectFile->append(serialized.begin(), serializedSize));
wait(objectFile->finish());
+ // free serialized since it is persisted in blob
+ serialized = Value();
wait(delay(0, TaskPriority::BlobWorkerUpdateFDB));
// object uploaded successfully, save it to system key space
@@ -613,7 +620,7 @@ ACTOR Future<BlobFileIndex> writeSnapshot(Reference<BlobWorkerData> bwData,
try {
wait(readAndCheckGranuleLock(tr, keyRange, epoch, seqno));
Key snapshotFileKey = blobGranuleFileKeyFor(granuleID, 'S', version);
- Key snapshotFileValue = blobGranuleFileValueFor(fname, 0, serialized.size());
+ Key snapshotFileValue = blobGranuleFileValueFor(fname, 0, serializedSize);
tr->set(snapshotFileKey, snapshotFileValue);
// create granule history at version if this is a new granule with the initial dump from FDB
if (createGranuleHistory) {
@@ -658,14 +665,14 @@ ACTOR Future<BlobFileIndex> writeSnapshot(Reference<BlobWorkerData> bwData,
keyRange.begin.printable(),
keyRange.end.printable(),
fname,
- serialized.size());
+ serializedSize);
}
if (BUGGIFY_WITH_PROB(0.1)) {
wait(delay(deterministicRandom()->random01()));
}
- return BlobFileIndex(version, fname, 0, serialized.size());
+ return BlobFileIndex(version, fname, 0, serializedSize);
}
ACTOR Future<BlobFileIndex> dumpInitialSnapshotFromFDB(Reference<BlobWorkerData> bwData,
@@ -1041,8 +1048,7 @@ static Version doGranuleRollback(Reference<GranuleMetadata> metadata,
}
// discard all in-memory mutations
- metadata->deltaArena = Arena();
- metadata->currentDeltas = GranuleDeltas();
+ metadata->currentDeltas = Standalone<GranuleDeltas>();
metadata->bufferedDeltaBytes = 0;
metadata->bufferedDeltaVersion = cfRollbackVersion;
@@ -1084,7 +1090,7 @@ static Version doGranuleRollback(Reference<GranuleMetadata> metadata,
metadata->bufferedDeltaBytes);
}
- metadata->currentDeltas.resize(metadata->deltaArena, mIdx);
+ metadata->currentDeltas.resize(metadata->currentDeltas.arena(), mIdx);
// delete all deltas in rollback range, but we can optimize here to just skip the uncommitted mutations
// directly and immediately pop the rollback out of inProgress to completed
@@ -1588,7 +1594,7 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
if (DEBUG_BW_VERSION(deltas.version)) {
fmt::print("BWB {0}: ({1})\n", deltas.version, deltas.mutations.size());
}
- metadata->currentDeltas.push_back_deep(metadata->deltaArena, deltas);
+ metadata->currentDeltas.push_back_deep(metadata->currentDeltas.arena(), deltas);
processedAnyMutations = true;
ASSERT(deltas.version != invalidVersion);
@@ -1641,7 +1647,6 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
startState.granuleID,
metadata->originalEpoch,
metadata->originalSeqno,
- metadata->deltaArena,
metadata->currentDeltas,
lastDeltaVersion,
previousFuture,
@@ -1659,8 +1664,7 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
bwData->stats.mutationBytesBuffered -= metadata->bufferedDeltaBytes;
// reset current deltas
- metadata->deltaArena = Arena();
- metadata->currentDeltas = GranuleDeltas();
+ metadata->currentDeltas = Standalone<GranuleDeltas>();
metadata->bufferedDeltaBytes = 0;
// if we just wrote a delta file, check if we need to compact here.
@@ -2353,7 +2357,7 @@ ACTOR Future<Void> handleBlobGranuleFileRequest(Reference<BlobWorkerData> bwData
metadata->pendingDeltaVersion);
}
ASSERT(metadata->durableDeltaVersion.get() == metadata->pendingDeltaVersion);
- rep.arena.dependsOn(metadata->deltaArena);
+ rep.arena.dependsOn(metadata->currentDeltas.arena());
for (auto& delta : metadata->currentDeltas) {
if (delta.version > req.readVersion) {
break;