diff --git a/documentation/sphinx/source/release-notes.rst b/documentation/sphinx/source/release-notes.rst index 895d39b779..4b0e1275c4 100644 --- a/documentation/sphinx/source/release-notes.rst +++ b/documentation/sphinx/source/release-notes.rst @@ -47,6 +47,7 @@ Performance * Increased the get read version batch size in the client. This change reduces the load on the proxies when doing many transactions with only a few operations per transaction. `(PR #1311) <https://github.com/apple/foundationdb/pull/1311>`_ * Clients no longer attempt to connect to the master during recovery. `(PR #1317) <https://github.com/apple/foundationdb/pull/1317>`_ +* SQLite page files now grow and shrink in chunks based on a knob which defaults to an effective chunk size of 4MB. [6.1.3] `(PR #1482) <https://github.com/apple/foundationdb/pull/1482>`_ Fixes ----- diff --git a/fdbrpc/AsyncFileKAIO.actor.h b/fdbrpc/AsyncFileKAIO.actor.h index 3c2110f91e..d9fb6de2f8 100644 --- a/fdbrpc/AsyncFileKAIO.actor.h +++ b/fdbrpc/AsyncFileKAIO.actor.h @@ -259,6 +259,8 @@ public: int result = -1; KAIOLogEvent(logFile, id, OpLogEntry::TRUNCATE, OpLogEntry::START, size / 4096); bool completed = false; + double begin = timer_monotonic(); + if( ctx.fallocateSupported && size >= lastFileSize ) { result = fallocate( fd, 0, 0, size); if (result != 0) { @@ -278,6 +280,12 @@ public: if ( !completed ) result = ftruncate(fd, size); + double end = timer_monotonic(); + if(g_nondeterministic_random->random01() < end-begin) { + TraceEvent("SlowKAIOTruncate") + .detail("TruncateTime", end - begin) + .detail("TruncateBytes", size - lastFileSize); + } KAIOLogEvent(logFile, id, OpLogEntry::TRUNCATE, OpLogEntry::COMPLETE, size / 4096, result); if(result != 0) { diff --git a/fdbserver/KeyValueStoreSQLite.actor.cpp b/fdbserver/KeyValueStoreSQLite.actor.cpp index c557d61740..bb7b8d8912 100644 --- a/fdbserver/KeyValueStoreSQLite.actor.cpp +++ b/fdbserver/KeyValueStoreSQLite.actor.cpp @@ -1325,6 +1325,9 @@ void SQLiteDB::open(bool writable) { int result = sqlite3_open_v2(apath.c_str(), &db, (writable ? 
SQLITE_OPEN_READWRITE : SQLITE_OPEN_READONLY), NULL); checkError("open", result); + int chunkSize = 4096 * (BUGGIFY ? g_random->randomInt(0, 100) : SERVER_KNOBS->SQLITE_CHUNK_SIZE_PAGES); + checkError("setChunkSize", sqlite3_file_control(db, nullptr, SQLITE_FCNTL_CHUNK_SIZE, &chunkSize)); + btree = db->aDb[0].pBt; initPagerCodec(); diff --git a/fdbserver/Knobs.cpp b/fdbserver/Knobs.cpp index d24351ddf5..562f3d3885 100644 --- a/fdbserver/Knobs.cpp +++ b/fdbserver/Knobs.cpp @@ -195,6 +195,7 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) { init( SQLITE_PAGE_SCAN_ERROR_LIMIT, 10000 ); init( SQLITE_BTREE_PAGE_USABLE, 4096 - 8); // pageSize - reserveSize for page checksum + init( SQLITE_CHUNK_SIZE_PAGES, 1024 ); // 4MB // Maximum and minimum cell payload bytes allowed on primary page as calculated in SQLite. // These formulas are copied from SQLite, using its hardcoded constants, so if you are diff --git a/fdbserver/Knobs.h b/fdbserver/Knobs.h index 5ad8a4427c..9ab339624e 100644 --- a/fdbserver/Knobs.h +++ b/fdbserver/Knobs.h @@ -166,6 +166,7 @@ public: int SQLITE_FRAGMENT_PRIMARY_PAGE_USABLE; int SQLITE_FRAGMENT_OVERFLOW_PAGE_USABLE; double SQLITE_FRAGMENT_MIN_SAVINGS; + int SQLITE_CHUNK_SIZE_PAGES; // KeyValueStoreSqlite spring cleaning double CLEANING_INTERVAL; diff --git a/fdbserver/VFSAsync.cpp b/fdbserver/VFSAsync.cpp index 6bc2132893..c7dce58664 100644 --- a/fdbserver/VFSAsync.cpp +++ b/fdbserver/VFSAsync.cpp @@ -80,7 +80,9 @@ struct VFSAsyncFile { int debug_zcrefs, debug_zcreads, debug_reads; - VFSAsyncFile(std::string const& filename, int flags) : filename(filename), flags(flags), pLockCount(&filename_lockCount_openCount[filename].first), debug_zcrefs(0), debug_zcreads(0), debug_reads(0) { + int chunkSize; + + VFSAsyncFile(std::string const& filename, int flags) : filename(filename), flags(flags), pLockCount(&filename_lockCount_openCount[filename].first), debug_zcrefs(0), debug_zcreads(0), debug_reads(0), chunkSize(0) { 
filename_lockCount_openCount[filename].second++; } ~VFSAsyncFile(); @@ -185,6 +187,12 @@ static int asyncWrite(sqlite3_file *pFile, const void *zBuf, int iAmt, sqlite_in static int asyncTruncate(sqlite3_file *pFile, sqlite_int64 size){ VFSAsyncFile *p = (VFSAsyncFile*)pFile; + + // Adjust size to a multiple of chunkSize if set + if(p->chunkSize != 0) { + size = ((size + p->chunkSize - 1) / p->chunkSize) * p->chunkSize; + } + try { waitFor( p->file->truncate( size ) ); return SQLITE_OK; @@ -245,7 +253,18 @@ static int asyncCheckReservedLock(sqlite3_file *pFile, int *pResOut){ -** No xFileControl() verbs are implemented by this VFS. +** Handles SQLITE_FCNTL_CHUNK_SIZE and SQLITE_FCNTL_SIZE_HINT; every other xFileControl() verb returns SQLITE_NOTFOUND. */ static int VFSAsyncFileControl(sqlite3_file *pFile, int op, void *pArg){ - return SQLITE_NOTFOUND; + VFSAsyncFile *p = (VFSAsyncFile*)pFile; + switch(op) { + case SQLITE_FCNTL_CHUNK_SIZE: + p->chunkSize = *(int *)pArg; + return SQLITE_OK; + + case SQLITE_FCNTL_SIZE_HINT: + return asyncTruncate(pFile, *(int64_t *)pArg); + + default: + return SQLITE_NOTFOUND; + }; } static int asyncSectorSize(sqlite3_file *pFile){ return 512; } // SOMEDAY: Would 4K be better?