From c7a120c59d5db28ed0771f13a3d3b8442d455699 Mon Sep 17 00:00:00 2001 From: Alex Miller Date: Wed, 29 Nov 2017 15:24:59 -0800 Subject: [PATCH 1/4] Rename IAsyncFile::incrementalDelete -> IAsyncFileSystem::incrementalDeleteFile. `deleteFile` existed in IAsyncFileSystem, so an incremental delete function seems to belong more as a virtual method on IAsyncFileSystem than a static method on IAsyncFile, and the naming should match. As long as we're here, change IAsyncFile to declare a virtual destructor, so that it has good and proper C++ behavior. I presume this is what was vaguely intended by the default constructor definition that previously existed? --- fdbrpc/IAsyncFile.actor.cpp | 6 +++--- fdbrpc/IAsyncFile.h | 14 +++++++------- fdbserver/DiskQueue.actor.cpp | 4 ++-- fdbserver/KeyValueStoreSQLite.actor.cpp | 4 ++-- 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/fdbrpc/IAsyncFile.actor.cpp b/fdbrpc/IAsyncFile.actor.cpp index 04dc2b7733..3fb55b2e3f 100644 --- a/fdbrpc/IAsyncFile.actor.cpp +++ b/fdbrpc/IAsyncFile.actor.cpp @@ -25,7 +25,7 @@ #include "flow/UnitTest.h" #include -IAsyncFile::IAsyncFile(){}; +IAsyncFile::~IAsyncFile() = default; ACTOR static Future incrementalDeleteHelper( std::string filename, bool mustBeDurable, int64_t truncateAmt, double interval ) { state Reference file; @@ -53,7 +53,7 @@ ACTOR static Future incrementalDeleteHelper( std::string filename, bool mu return Void(); } -Future IAsyncFile::incrementalDelete( std::string filename, bool mustBeDurable ) { +Future IAsyncFileSystem::incrementalDeleteFile( std::string filename, bool mustBeDurable ) { return uncancellable(incrementalDeleteHelper( filename, mustBeDurable, @@ -74,6 +74,6 @@ TEST_CASE( "fileio/incrementalDelete" ) { Void _ = wait(f->truncate(fileSize)); //close the file by deleting the reference f.clear(); - Void _ = wait(IAsyncFile::incrementalDelete(filename, true)); + Void _ = wait(IAsyncFileSystem::filesystem()->incrementalDeleteFile(filename, true)); return Void(); } diff --git a/fdbrpc/IAsyncFile.h b/fdbrpc/IAsyncFile.h index 4da4fb7a8b..a4f5c1cba1 100644 --- a/fdbrpc/IAsyncFile.h +++ b/fdbrpc/IAsyncFile.h @@ -27,7 +27,7 @@ //All outstanding operations must be cancelled before the destructor of IAsyncFile is called. class IAsyncFile { public: - IAsyncFile(); + virtual ~IAsyncFile(); // Pass these to g_network->open to get an IAsyncFile enum { // Implementation relies on the low bits being the same as the SQLite flags (this is validated by a static_assert there) @@ -58,10 +58,6 @@ public: virtual Future size() = 0; virtual std::string getFilename() = 0; - // Unlinks a file and then deletes it slowly by truncating the file repeatedly. - // If mustBeDurable, returns only when the file is guaranteed to be deleted even after a power failure. - static Future incrementalDelete( std::string filename, bool mustBeDurable ); - // Attempt to read the *length bytes at offset without copying. If successful, a pointer to the // requested bytes is written to *data, and the number of bytes successfully read is // written to *length. If unsuccessful, *data and *length are undefined. @@ -83,11 +79,15 @@ typedef void (*runCycleFuncPtr)(); class IAsyncFileSystem { public: - virtual Future< Reference > open( std::string filename, int64_t flags, int64_t mode ) = 0; // Opens a file for asynchronous I/O + virtual Future< Reference > open( std::string filename, int64_t flags, int64_t mode ) = 0; - virtual Future< Void > deleteFile( std::string filename, bool mustBeDurable ) = 0; // Deletes the given file. If mustBeDurable, returns only when the file is guaranteed to be deleted even after a power failure. + virtual Future< Void > deleteFile( std::string filename, bool mustBeDurable ) = 0; + + // Unlinks a file and then deletes it slowly by truncating the file repeatedly. + // If mustBeDurable, returns only when the file is guaranteed to be deleted even after a power failure. + virtual Future incrementalDeleteFile( std::string filename, bool mustBeDurable ); static IAsyncFileSystem* filesystem() { return filesystem(g_network); } static runCycleFuncPtr runCycleFunc() { return reinterpret_cast(reinterpret_cast(g_network->global(INetwork::enRunCycleFunc))); } diff --git a/fdbserver/DiskQueue.actor.cpp b/fdbserver/DiskQueue.actor.cpp index 275fc92a34..7a8442e419 100644 --- a/fdbserver/DiskQueue.actor.cpp +++ b/fdbserver/DiskQueue.actor.cpp @@ -405,8 +405,8 @@ public: TraceEvent("DiskQueueShutdownDeleting", self->dbgid) .detail("File0", self->filename(0)) .detail("File1", self->filename(1)); - Void _ = wait( IAsyncFile::incrementalDelete( self->filename(0), false ) ); - Void _ = wait( IAsyncFile::incrementalDelete( self->filename(1), true ) ); + Void _ = wait( IAsyncFileSystem::filesystem()->incrementalDeleteFile( self->filename(0), false ) ); + Void _ = wait( IAsyncFileSystem::filesystem()->incrementalDeleteFile( self->filename(1), true ) ); } TraceEvent("DiskQueueShutdownComplete", self->dbgid) .detail("DeleteFiles", deleteFiles) diff --git a/fdbserver/KeyValueStoreSQLite.actor.cpp b/fdbserver/KeyValueStoreSQLite.actor.cpp index b2633f8db0..0b38aea459 100644 --- a/fdbserver/KeyValueStoreSQLite.actor.cpp +++ b/fdbserver/KeyValueStoreSQLite.actor.cpp @@ -1827,8 +1827,8 @@ private: self->logging.cancel(); Void _ = wait( self->readThreads->stop() && self->writeThread->stop() ); if (deleteOnClose) { - Void _ = wait( IAsyncFile::incrementalDelete( self->filename, true ) ); - Void _ = wait( IAsyncFile::incrementalDelete( self->filename + "-wal", false ) ); + Void _ = wait( IAsyncFileSystem::filesystem()->incrementalDeleteFile( self->filename, true ) ); + Void _ = wait( IAsyncFileSystem::filesystem()->incrementalDeleteFile( self->filename + "-wal", false ) ); } } catch (Error& e) { TraceEvent(SevError, "KVDoCloseError", self->logID) From 196258080bc850735ee71323030962c37c6c6fee Mon Sep 17 00:00:00 2001 From: Alex Miller Date: Wed, 29 Nov 2017 17:24:04 -0800 Subject: [PATCH 2/4] Refactor zeroing a chunk of a file from DiskQueue into IAsyncFile. If we're going to do the work to provide more optimized ways to zero files, then I'd feel better with this being in a more common place, so that any other zero-ers are likely to reuse it. It also makes testing easier/more obvious. Also, because it's needed for correctness, fix the aligned_alloc for OSX, which wasn't aligned, and use an actually aligned allocation function. --- fdbrpc/IAsyncFile.actor.cpp | 54 +++++++++++++++++++++++++++++++++++ fdbrpc/IAsyncFile.h | 2 ++ fdbserver/DiskQueue.actor.cpp | 15 ++-------- flow/Platform.h | 7 ++++- 4 files changed, 65 insertions(+), 13 deletions(-) diff --git a/fdbrpc/IAsyncFile.actor.cpp b/fdbrpc/IAsyncFile.actor.cpp index 3fb55b2e3f..4f3989d6ec 100644 --- a/fdbrpc/IAsyncFile.actor.cpp +++ b/fdbrpc/IAsyncFile.actor.cpp @@ -27,6 +27,60 @@ IAsyncFile::~IAsyncFile() = default; +const static unsigned int ONE_MEGABYTE = 1<<20; +const static unsigned int FOUR_KILOBYTES = 4<<10; + +ACTOR static Future zeroRangeHelper( IAsyncFile* f, int64_t offset, int64_t length, int fixedbyte) { + state int64_t pos = offset; + state void* zeros = aligned_alloc( ONE_MEGABYTE, ONE_MEGABYTE ); + memset( zeros, fixedbyte, ONE_MEGABYTE ); + + while(pos < offset+length) { + state int len = std::min( ONE_MEGABYTE, offset+length-pos ); + Void _ = wait( f->write( zeros, len, pos ) ); + pos += len; + Void _ = wait( yield() ); + } + + free(zeros); + return Void(); +} + +Future IAsyncFile::zeroRange(int64_t offset, int64_t length) { + return uncancellable(zeroRangeHelper(this, offset, length, 0)); +} + +TEST_CASE( "fileio/zero" ) { + state std::string filename = "/tmp/__ZEROJUNK__"; + state Reference f = + wait(IAsyncFileSystem::filesystem()->open( + filename, + IAsyncFile::OPEN_ATOMIC_WRITE_AND_CREATE | IAsyncFile::OPEN_CREATE | IAsyncFile::OPEN_READWRITE, + 0)); + + // Verify that we can grow a file with zero(). + Void _ = wait(f->sync()); + Void _ = wait(f->zeroRange(0, ONE_MEGABYTE)); + int64_t size = wait(f->size()); + ASSERT( ONE_MEGABYTE == size ); + + // Verify that zero() does, in fact, zero. + Void _ = wait(zeroRangeHelper(f.getPtr(), 0, ONE_MEGABYTE, 0xff)); + Void _ = wait(f->zeroRange(0, ONE_MEGABYTE)); + state uint8_t* page = (uint8_t*)malloc(FOUR_KILOBYTES); + int n = wait( f->read(page, FOUR_KILOBYTES, 0) ); + ASSERT( n == FOUR_KILOBYTES ); + for (int i = 0; i < FOUR_KILOBYTES; i++) { + ASSERT(page[i] == 0); + } + free(page); + + // Destruct our file and remove it. + f.clear(); + Void _ = wait( IAsyncFileSystem::filesystem()->deleteFile(filename, true) ); + return Void(); +} + ACTOR static Future incrementalDeleteHelper( std::string filename, bool mustBeDurable, int64_t truncateAmt, double interval ) { state Reference file; state int64_t remainingFileSize; diff --git a/fdbrpc/IAsyncFile.h b/fdbrpc/IAsyncFile.h index a4f5c1cba1..ca0e2bd349 100644 --- a/fdbrpc/IAsyncFile.h +++ b/fdbrpc/IAsyncFile.h @@ -52,6 +52,8 @@ public: // For read() and write(), the data buffer must remain valid until the future is ready virtual Future read( void* data, int length, int64_t offset ) = 0; // Returns number of bytes actually read (from [0,length]) virtual Future write( void const* data, int length, int64_t offset ) = 0; + // The zeroed data is not guaranteed to be durable after `zeroRange` returns. A call to sync() would be required. + virtual Future zeroRange( int64_t offset, int64_t length ); virtual Future truncate( int64_t size ) = 0; virtual Future sync() = 0; virtual Future flush() { return Void(); } // Sends previous writes to the OS if they have been buffered in memory, but does not make them power safe diff --git a/fdbserver/DiskQueue.actor.cpp b/fdbserver/DiskQueue.actor.cpp index 7a8442e419..10aa47104c 100644 --- a/fdbserver/DiskQueue.actor.cpp +++ b/fdbserver/DiskQueue.actor.cpp @@ -581,20 +581,11 @@ public: ACTOR static UNCANCELLABLE Future truncateFile(RawDiskQueue_TwoFiles* self, int file, int64_t pos) { state TrackMe trackMe(self); - state StringBuffer zeros( self->dbgid ); - TraceEvent("DQTruncateFile", self->dbgid).detail("File", file).detail("Pos", pos).detail("File0Name", self->files[0].dbgFilename); - - zeros.alignReserve( sizeof(Page), 1<<20 ); - memset( zeros.append(1<<20), 0, 1<<20 ); - - while(pos < self->files[file].size) { - state int len = std::min(zeros.size(), self->files[file].size-pos); - Void _ = wait( self->files[file].f->write( zeros.str.begin(), len, pos ) ); - pos += len; - } - + state Reference f = self->files[file].f; // Hold onto a reference in the off-chance that the DQ is removed from underneath us. + Void _ = wait( f->zeroRange( pos, self->files[file].size-pos ) ); Void _ = wait(self->files[file].syncQueue->onSync()); + // We intentionally don't return the f->zero future, so that TrackMe is destructed after f->zero finishes. return Void(); } diff --git a/flow/Platform.h b/flow/Platform.h index 7e17ae7af4..9435d3eed2 100644 --- a/flow/Platform.h +++ b/flow/Platform.h @@ -489,7 +489,12 @@ inline static void aligned_free(void* ptr) { free(ptr); } inline static void* aligned_alloc(size_t alignment, size_t size) { return memalign(alignment, size); } #endif #elif defined(__APPLE__) -inline static void* aligned_alloc(size_t alignment, size_t size) { return malloc(size); } // FIXME: OSX doesn't have memalign(). All allocations are 16-byte aligned +#include +inline static void* aligned_alloc(size_t alignment, size_t size) { + void* ptr = nullptr; + posix_memalign(&ptr, alignment, size); + return ptr; +} inline static void aligned_free(void* ptr) { free(ptr); } #endif From 7bab3a4ece1304e275a486db1c530cc6d45f650c Mon Sep 17 00:00:00 2001 From: Alex Miller Date: Thu, 30 Nov 2017 16:55:39 -0800 Subject: [PATCH 3/4] AsyncFileKAIO will prefer using fallocate's ZERO_RANGE for AsyncFile::zero(). For situations in which we have support for FALLOC_FL_ZERO_RANGE, it's much faster to use fallocate than manually overwrite the file with zero bytes. Note that this support depends on having a kernel from late 2014 or newer, and being on ext4 or xfs. If these conditions aren't met, we'll fall back to writing zeros in 1MB chunks as normal. --- fdbrpc/AsyncFileKAIO.actor.h | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/fdbrpc/AsyncFileKAIO.actor.h b/fdbrpc/AsyncFileKAIO.actor.h index 81cc3870f1..c0fcc980fc 100644 --- a/fdbrpc/AsyncFileKAIO.actor.h +++ b/fdbrpc/AsyncFileKAIO.actor.h @@ -227,10 +227,27 @@ public: return success(result); } +// TODO(alexmiller): Remove when we upgrade the dev docker image to >14.10 +#ifndef FALLOC_FL_ZERO_RANGE +#define FALLOC_FL_ZERO_RANGE 0x10 +#endif + virtual Future zeroRange( int64_t offset, int64_t length ) override { + bool success = false; + if (ctx.fallocateZeroSupported) { + int rc = fallocate( fd, FALLOC_FL_ZERO_RANGE, offset, length ); + if (rc == EOPNOTSUPP) { + ctx.fallocateZeroSupported = false; + } + if (rc == 0) { + success = true; + } + } + return success ? Void() : IAsyncFile::zeroRange(offset, length); + } virtual Future truncate( int64_t size ) { ++countFileLogicalWrites; ++countLogicalWrites; - + if(failed) { return io_timeout(); } @@ -484,6 +501,7 @@ private: int outstanding; double ioStallBegin; bool fallocateSupported; + bool fallocateZeroSupported; std::priority_queue, IOBlock::indirect_order_by_priority> queue; Int64MetricHandle countAIOSubmit; Int64MetricHandle countAIOCollect; @@ -499,7 +517,7 @@ private: EventMetricHandle slowAioSubmitMetric; uint32_t opsIssued; - Context() : iocx(0), evfd(-1), outstanding(0), opsIssued(0), ioStallBegin(0), fallocateSupported(true), submittedRequestList(nullptr) { + Context() : iocx(0), evfd(-1), outstanding(0), opsIssued(0), ioStallBegin(0), fallocateSupported(true), fallocateZeroSupported(true), submittedRequestList(nullptr) { setIOTimeout(0); } From 064670a95b10cd7c1dd60c4a2f0e61dac8cdb157 Mon Sep 17 00:00:00 2001 From: Alex Miller Date: Wed, 6 Dec 2017 13:41:21 -0800 Subject: [PATCH 4/4] Maintain a reference to the IAsyncFile in zeroRange. And also add some notes about the reference semantics to the IAsyncFile header for future readers. --- fdbrpc/IAsyncFile.actor.cpp | 6 +++--- fdbrpc/IAsyncFile.h | 7 ++++++- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/fdbrpc/IAsyncFile.actor.cpp b/fdbrpc/IAsyncFile.actor.cpp index 4f3989d6ec..98ebab1461 100644 --- a/fdbrpc/IAsyncFile.actor.cpp +++ b/fdbrpc/IAsyncFile.actor.cpp @@ -30,7 +30,7 @@ IAsyncFile::~IAsyncFile() = default; const static unsigned int ONE_MEGABYTE = 1<<20; const static unsigned int FOUR_KILOBYTES = 4<<10; -ACTOR static Future zeroRangeHelper( IAsyncFile* f, int64_t offset, int64_t length, int fixedbyte) { +ACTOR static Future zeroRangeHelper( Reference f, int64_t offset, int64_t length, int fixedbyte) { state int64_t pos = offset; state void* zeros = aligned_alloc( ONE_MEGABYTE, ONE_MEGABYTE ); memset( zeros, fixedbyte, ONE_MEGABYTE ); @@ -47,7 +47,7 @@ ACTOR static Future zeroRangeHelper( IAsyncFile* f, int64_t offset, int64_ } Future IAsyncFile::zeroRange(int64_t offset, int64_t length) { - return uncancellable(zeroRangeHelper(this, offset, length, 0)); + return uncancellable(zeroRangeHelper(Reference::addRef(this), offset, length, 0)); } TEST_CASE( "fileio/zero" ) { @@ -65,7 +65,7 @@ TEST_CASE( "fileio/zero" ) { ASSERT( ONE_MEGABYTE == size ); // Verify that zero() does, in fact, zero. - Void _ = wait(zeroRangeHelper(f.getPtr(), 0, ONE_MEGABYTE, 0xff)); + Void _ = wait(zeroRangeHelper(f, 0, ONE_MEGABYTE, 0xff)); Void _ = wait(f->zeroRange(0, ONE_MEGABYTE)); state uint8_t* page = (uint8_t*)malloc(FOUR_KILOBYTES); int n = wait( f->read(page, FOUR_KILOBYTES, 0) ); diff --git a/fdbrpc/IAsyncFile.h b/fdbrpc/IAsyncFile.h index ca0e2bd349..ca0cf1a280 100644 --- a/fdbrpc/IAsyncFile.h +++ b/fdbrpc/IAsyncFile.h @@ -24,7 +24,11 @@ #include "flow/flow.h" -//All outstanding operations must be cancelled before the destructor of IAsyncFile is called. +// All outstanding operations must be cancelled before the destructor of IAsyncFile is called. +// The desirability of the above semantic is disputed. Some classes (AsyncFileBlobStore, +// AsyncFileCached) maintain references, while others (AsyncFileNonDurable) don't, and the comment +// is unapplicable to some others as well (AsyncFileKAIO). It's safest to assume that all operations +// must complete or cancel, but you should probably look at the file implementations you'll be using. class IAsyncFile { public: virtual ~IAsyncFile(); @@ -53,6 +57,7 @@ public: virtual Future read( void* data, int length, int64_t offset ) = 0; // Returns number of bytes actually read (from [0,length]) virtual Future write( void const* data, int length, int64_t offset ) = 0; // The zeroed data is not guaranteed to be durable after `zeroRange` returns. A call to sync() would be required. + // This operation holds a reference to the AsyncFile, and does not need to be cancelled before a reference is dropped. virtual Future zeroRange( int64_t offset, int64_t length ); virtual Future truncate( int64_t size ) = 0; virtual Future sync() = 0;