829 lines
28 KiB
C++
829 lines
28 KiB
C++
/*
|
|
* VFSAsync.cpp
|
|
*
|
|
* This source file is part of the FoundationDB open source project
|
|
*
|
|
* Copyright 2013-2018 Apple Inc. and the FoundationDB project authors
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
|
|
#include "sqlite/sqlite3.h"
|
|
#include <stdio.h>
|
|
#include <string>
|
|
#include <vector>
|
|
#include "fdbrpc/fdbrpc.h"
|
|
#include "fdbrpc/IAsyncFile.h"
|
|
#include "fdbserver/CoroFlow.h"
|
|
#include "fdbrpc/simulator.h"
|
|
#include "fdbrpc/AsyncFileReadAhead.actor.h"
|
|
|
|
#include <assert.h>
|
|
#include <string.h>
|
|
|
|
#ifdef WIN32
|
|
#include <Windows.h>
|
|
#endif
|
|
|
|
#ifdef __unixish__
|
|
#include <sys/types.h>
|
|
#include <sys/stat.h>
|
|
#include <sys/file.h>
|
|
#include <sys/param.h>
|
|
#include <sys/time.h>
|
|
#include <unistd.h>
|
|
#include <errno.h>
|
|
#include <fcntl.h>
|
|
#endif
|
|
|
|
#include "fdbserver/VFSAsync.h"
|
|
|
|
/*
|
|
** The maximum pathname length supported by this VFS.
|
|
*/
|
|
#define MAXPATHNAME 512
|
|
|
|
#define NO_LOCK 0
|
|
#define SHARED_LOCK 1
|
|
#define RESERVED_LOCK 2
|
|
#define PENDING_LOCK 3
|
|
#define EXCLUSIVE_LOCK 4
|
|
const uint32_t RESERVED_COUNT = 1U << 29;
|
|
|
|
// Registers one more open handle for `filename` in the per-file bookkeeping map and points pLockCount at the
// lock counter stored in that same map entry.
VFSAsyncFile::VFSAsyncFile(std::string const& filename, int flags)
  : flags(flags), filename(filename), pLockCount(&filename_lockCount_openCount[filename].first), debug_zcrefs(0),
    debug_zcreads(0), debug_reads(0), chunkSize(0) {
	// .first is the lock count, .second is the open count (see the trace details below).
	auto& counts = filename_lockCount_openCount[filename];
	++counts.second;

	TraceEvent(SevDebug, "VFSAsyncFileConstruct")
	    .detail("Filename", filename)
	    .detail("OpenCount", counts.second)
	    .detail("LockCount", counts.first)
	    .backtrace();
}
|
|
|
|
// Bookkeeping keyed by filename and shared by all VFSAsyncFile objects opened on that name:
// .first is the lock count and .second is the number of currently constructed handles.
std::map<std::string, std::pair<uint32_t, int>> VFSAsyncFile::filename_lockCount_openCount;
|
|
|
|
// xClose: tears down the VFSAsyncFile that lives inside the sqlite3_file storage.
// Always returns SQLITE_OK.
static int asyncClose(sqlite3_file* pFile) {
	VFSAsyncFile* p = (VFSAsyncFile*)pFile;

	TraceEvent(SevDebug, "VFSAsyncFileDestroy").detail("Filename", p->filename).backtrace();

	// Every zero-copy reference handed out by asyncReadZeroCopy must have been released by now.
	ASSERT(!p->debug_zcrefs);

	// The storage belongs to SQLite, so only the destructor is run; nothing is deallocated here.
	p->~VFSAsyncFile();

	return SQLITE_OK;
}
|
|
|
|
// xRead: blocking read of iAmt bytes at offset iOfst into zBuf.
// Returns SQLITE_OK on a full read, SQLITE_IOERR_SHORT_READ (with the unread tail zero-filled) when fewer
// bytes were available, and SQLITE_IOERR_READ if the underlying read raised an Error.
static int asyncRead(sqlite3_file* pFile, void* zBuf, int iAmt, sqlite_int64 iOfst) {
	VFSAsyncFile* vfsFile = (VFSAsyncFile*)pFile;
	try {
		++vfsFile->debug_reads;
		const int got = waitForAndGet(vfsFile->file->read(zBuf, iAmt, iOfst));
		if (got >= iAmt) {
			return SQLITE_OK;
		}
		// When reading past the EOF, sqlite expects the extra portion of the buffer to be zeroed.
		memset((uint8_t*)zBuf + got, 0, iAmt - got);
		return SQLITE_IOERR_SHORT_READ;
	} catch (Error& e) {
		// Remember injected faults so they can be told apart from real I/O errors later.
		if (e.isInjectedFault()) {
			VFSAsyncFile::setInjectedError(SQLITE_IOERR_READ);
		}
		return SQLITE_IOERR_READ;
	}
}
|
|
|
|
#if 1
|
|
// Releases a buffer previously handed out by asyncReadZeroCopy and decrements the outstanding-reference
// debug counter. Returns SQLITE_OK, or SQLITE_IOERR if the underlying release raised an Error.
static int asyncReleaseZeroCopy(sqlite3_file* pFile, void* data, int iAmt, sqlite_int64 iOfst) {
	VFSAsyncFile* vfsFile = (VFSAsyncFile*)pFile;
	int rc = SQLITE_OK;
	try {
		--vfsFile->debug_zcrefs;
		vfsFile->file->releaseZeroCopy(data, iAmt, iOfst);
	} catch (Error& e) {
		if (e.isInjectedFault()) {
			VFSAsyncFile::setInjectedError(SQLITE_IOERR);
		}
		rc = SQLITE_IOERR;
	}
	return rc;
}
|
|
|
|
// Zero-copy read: on success *data points at iAmt bytes owned by the underlying file, which the caller must
// hand back through asyncReleaseZeroCopy. *pDataWasCached (if non-null) reports whether the read future was
// already ready when it was issued.
// Returns SQLITE_OK, SQLITE_IOERR_SHORT_READ (reference already released), or SQLITE_IOERR_READ on Error.
static int asyncReadZeroCopy(sqlite3_file* pFile, void** data, int iAmt, sqlite_int64 iOfst, int* pDataWasCached) {
	VFSAsyncFile* vfsFile = (VFSAsyncFile*)pFile;
	try {
		int got = iAmt;
		Future<Void> pending = vfsFile->file->readZeroCopy(data, &got, iOfst);
		if (pDataWasCached != nullptr) {
			*pDataWasCached = pending.isReady() ? 1 : 0;
		}
		waitFor(pending);
		++vfsFile->debug_zcrefs;

		if (got >= iAmt) {
			++vfsFile->debug_zcreads;
			return SQLITE_OK;
		}

		// When reading past the EOF, sqlite expects the extra portion of the buffer to be zeroed. We can't do
		// that on a borrowed buffer, so give the reference back and let sqlite use the slow path.
		asyncReleaseZeroCopy(pFile, *data, got, iOfst);
		return SQLITE_IOERR_SHORT_READ;
	} catch (Error& e) {
		if (e.isInjectedFault()) {
			VFSAsyncFile::setInjectedError(SQLITE_IOERR_READ);
		}
		return SQLITE_IOERR_READ;
	}
}
|
|
|
|
#else
|
|
// Fallback (currently compiled out) "zero copy" read that simply allocates a heap buffer and reads into it.
// On success, or on a short read, *data owns a new char[iAmt] buffer that must be returned through
// asyncReleaseZeroCopy. If the read raises an Error the buffer is freed here and *data is set to nullptr, so
// the allocation is not leaked.
// NOTE(review): unlike the live implementation this variant takes no pDataWasCached argument and keeps the
// buffer alive on a short read - confirm both against the sqlite3_io_methods contract before re-enabling.
static int asyncReadZeroCopy(sqlite3_file* pFile, void** data, int iAmt, sqlite_int64 iOfst) {
	VFSAsyncFile* p = (VFSAsyncFile*)pFile;
	*data = nullptr;
	try {
		*data = new char[iAmt];
		int readBytes = waitForAndGet(p->file->read(*data, iAmt, iOfst));
		// printf("+asyncReadRef %p +%lld %d/%d = %p\n", pFile, iOfst, readBytes, iAmt, *data);
		if (readBytes < iAmt) {
			// When reading past the EOF, sqlite expects the extra portion of the buffer to be zeroed
			memset((uint8_t*)*data + readBytes, 0, iAmt - readBytes);
			return SQLITE_IOERR_SHORT_READ;
		}
		return SQLITE_OK;
	} catch (Error& e) {
		// Nobody will release a buffer for a failed read, so free it before reporting the error.
		delete[] static_cast<char*>(*data);
		*data = nullptr;
		if (e.isInjectedFault()) {
			VFSAsyncFile::setInjectedError(SQLITE_IOERR_READ);
		}
		return SQLITE_IOERR_READ;
	}
}
|
|
// Fallback (currently compiled out) release: frees the heap buffer allocated by the fallback
// asyncReadZeroCopy. iAmt and iOfst are unused. Always returns SQLITE_OK.
static int asyncReleaseZeroCopy(sqlite3_file* pFile, void* data, int iAmt, sqlite_int64 iOfst) {
	// printf("-asyncReleaseRef %p +%lld %d <= %p\n", pFile, iOfst, iAmt, data);
	char* buffer = (char*)data;
	delete[] buffer;
	return SQLITE_OK;
}
|
|
#endif
|
|
|
|
// xWrite: blocking write of iAmt bytes from zBuf at offset iOfst.
// Returns SQLITE_OK, or SQLITE_IOERR_WRITE if the underlying write raised an Error.
static int asyncWrite(sqlite3_file* pFile, const void* zBuf, int iAmt, sqlite_int64 iOfst) {
	VFSAsyncFile* vfsFile = (VFSAsyncFile*)pFile;
	int rc = SQLITE_OK;
	try {
		waitFor(vfsFile->file->write(zBuf, iAmt, iOfst));
	} catch (Error& e) {
		if (e.isInjectedFault()) {
			VFSAsyncFile::setInjectedError(SQLITE_IOERR_WRITE);
		}
		rc = SQLITE_IOERR_WRITE;
	}
	return rc;
}
|
|
|
|
// xTruncate: resizes the file to `size` bytes, first rounding `size` up to a whole number of chunks when a
// chunk size has been configured via SQLITE_FCNTL_CHUNK_SIZE.
// Returns SQLITE_OK, or SQLITE_IOERR_TRUNCATE if the underlying truncate raised an Error.
static int asyncTruncate(sqlite3_file* pFile, sqlite_int64 size) {
	VFSAsyncFile* vfsFile = (VFSAsyncFile*)pFile;

	const auto chunk = vfsFile->chunkSize;
	if (chunk != 0) {
		// Round up to the next multiple of the chunk size.
		size = ((size + chunk - 1) / chunk) * chunk;
	}

	int rc = SQLITE_OK;
	try {
		waitFor(vfsFile->file->truncate(size));
	} catch (Error& e) {
		if (e.isInjectedFault()) {
			VFSAsyncFile::setInjectedError(SQLITE_IOERR_TRUNCATE);
		}
		rc = SQLITE_IOERR_TRUNCATE;
	}
	return rc;
}
|
|
|
|
// xSync: blocks until the underlying file reports that it has been synced. The `flags` argument is ignored.
// Returns SQLITE_OK, or SQLITE_IOERR_FSYNC (after tracing the failure) if the sync raised an Error.
static int asyncSync(sqlite3_file* pFile, int flags) {
	VFSAsyncFile* vfsFile = (VFSAsyncFile*)pFile;
	try {
		waitFor(vfsFile->file->sync());
	} catch (Error& e) {
		if (e.isInjectedFault()) {
			VFSAsyncFile::setInjectedError(SQLITE_IOERR_FSYNC);
		}

		TraceEvent("VFSAsyncFileSyncError")
		    .error(e)
		    .detail("Filename", vfsFile->filename)
		    .detail("Sqlite3File", (int64_t)pFile)
		    .detail("IAsyncFile", (int64_t)vfsFile->file.getPtr());

		return SQLITE_IOERR_FSYNC;
	}
	return SQLITE_OK;
}
|
|
|
|
/*
|
|
** Write the size of the file in bytes to *pSize.
|
|
*/
|
|
// xFileSize: stores the current size of the file in *pSize.
// Returns SQLITE_OK, or SQLITE_IOERR_FSTAT (leaving *pSize untouched) if the size query raised an Error.
static int VFSAsyncFileSize(sqlite3_file* pFile, sqlite_int64* pSize) {
	VFSAsyncFile* vfsFile = (VFSAsyncFile*)pFile;
	int rc = SQLITE_OK;
	try {
		*pSize = waitForAndGet(vfsFile->file->size());
	} catch (Error& e) {
		if (e.isInjectedFault()) {
			VFSAsyncFile::setInjectedError(SQLITE_IOERR_FSTAT);
		}
		rc = SQLITE_IOERR_FSTAT;
	}
	return rc;
}
|
|
|
|
// xLock: no lock state is tracked here. A request for EXCLUSIVE_LOCK is always refused with SQLITE_BUSY;
// every other lock level is granted unconditionally.
static int asyncLock(sqlite3_file* pFile, int eLock) {
	//TraceEvent("FileLock").detail("File", p->filename).detail("Fd", p->file->debugFD()).detail("PrevLockLevel", p->lockLevel).detail("Op", eLock).detail("LockCount", *p->pLockCount);

	if (eLock == EXCLUSIVE_LOCK) {
		return SQLITE_BUSY;
	}
	return SQLITE_OK;
}
|
|
// xUnlock: nothing to release because asyncLock keeps no state. Only downgrades to SHARED_LOCK or NO_LOCK
// are expected (checked by assert). Always returns SQLITE_OK.
static int asyncUnlock(sqlite3_file* pFile, int eLock) {
	assert(eLock <= SHARED_LOCK);
	return SQLITE_OK;
}
|
|
// xCheckReservedLock: sets *pResOut to 1 when the shared per-file lock count has reached RESERVED_COUNT,
// otherwise to 0. Always returns SQLITE_OK.
static int asyncCheckReservedLock(sqlite3_file* pFile, int* pResOut) {
	VFSAsyncFile* vfsFile = (VFSAsyncFile*)pFile;
	const bool reserved = *vfsFile->pLockCount >= RESERVED_COUNT;
	*pResOut = reserved ? 1 : 0;
	return SQLITE_OK;
}
|
|
|
|
/*
|
|
** xFileControl(): handles SQLITE_FCNTL_CHUNK_SIZE and SQLITE_FCNTL_SIZE_HINT; every other verb returns SQLITE_NOTFOUND.
|
|
*/
|
|
static int VFSAsyncFileControl(sqlite3_file* pFile, int op, void* pArg) {
	// SQLITE_FCNTL_CHUNK_SIZE: pArg points at an int; remember it so asyncTruncate can round sizes up.
	if (op == SQLITE_FCNTL_CHUNK_SIZE) {
		VFSAsyncFile* vfsFile = (VFSAsyncFile*)pFile;
		vfsFile->chunkSize = *(int*)pArg;
		return SQLITE_OK;
	}

	// SQLITE_FCNTL_SIZE_HINT: pArg points at a 64-bit size; implemented as a truncate to that size.
	if (op == SQLITE_FCNTL_SIZE_HINT) {
		return asyncTruncate(pFile, *(int64_t*)pArg);
	}

	// Anything else is not understood by this VFS.
	return SQLITE_NOTFOUND;
}
|
|
|
|
// xSectorSize: reports a fixed sector size of 512 bytes for every file.
// SOMEDAY: Would 4K be better?
static int asyncSectorSize(sqlite3_file* pFile) {
	const int sectorBytes = 512;
	return sectorBytes;
}
|
|
// xDeviceCharacteristics: advertises no special device capabilities (empty bit mask).
static int asyncDeviceCharacteristics(sqlite3_file* pFile) {
	const int noCharacteristics = 0;
	return noCharacteristics;
}
|
|
|
|
#if 1
|
|
struct SharedMemoryInfo { // for a file
|
|
std::string filename;
|
|
std::vector<void*> regions;
|
|
int regionSize;
|
|
int refcount; // Number of connections with this open
|
|
int sharedLocks[SQLITE_SHM_NLOCK];
|
|
int exclusiveLocks[SQLITE_SHM_NLOCK];
|
|
|
|
SharedMemoryInfo() : regionSize(0), refcount(0) {
|
|
memset(sharedLocks, 0, sizeof(sharedLocks));
|
|
memset(exclusiveLocks, 0, sizeof(exclusiveLocks));
|
|
}
|
|
void cleanup() {
|
|
for (int i = 0; i < regions.size(); i++)
|
|
delete[](uint8_t*) regions[i];
|
|
table.erase(filename);
|
|
}
|
|
|
|
static Mutex mutex;
|
|
static std::map<std::string, SharedMemoryInfo> table;
|
|
};
|
|
Mutex SharedMemoryInfo::mutex;
|
|
std::map<std::string, SharedMemoryInfo> SharedMemoryInfo::table;
|
|
|
|
/*
|
|
** This function is called to obtain a pointer to region iRegion of the
|
|
** shared-memory associated with the database file fd. Shared-memory regions
|
|
** are numbered starting from zero. Each shared-memory region is szRegion
|
|
** bytes in size.
|
|
**
|
|
** If an error occurs, an error code is returned and *pp is set to nullptr.
|
|
**
|
|
** Otherwise, if the bExtend parameter is 0 and the requested shared-memory
|
|
** region has not been allocated (by any client, including one running in a
|
|
** separate process), then *pp is set to nullptr and SQLITE_OK returned. If
|
|
** bExtend is non-zero and the requested shared-memory region has not yet
|
|
** been allocated, it is allocated by this function.
|
|
**
|
|
** If the shared-memory region has already been allocated or is allocated by
|
|
** this call as described above, then it is mapped into this process's
|
|
** address space (if it is not already), *pp is set to point to the mapped
|
|
** memory and SQLITE_OK returned.
|
|
*/
|
|
static int asyncShmMap(sqlite3_file* fd, /* Handle open on database file */
                       int iRegion, /* Region to retrieve */
                       int szRegion, /* Size of regions */
                       int bExtend, /* True to extend file if necessary */
                       void volatile** pp /* OUT: Mapped memory */
) {
	MutexHolder hold(SharedMemoryInfo::mutex);

	VFSAsyncFile* pDbFd = (VFSAsyncFile*)fd;
	SharedMemoryInfo* memInfo = pDbFd->sharedMemory;
	if (memInfo) {
		// Already attached: every call on a connection must use the same region size.
		assert(memInfo->regionSize == szRegion);
	} else {
		// First call on this connection: attach to (or create) the table entry for this file.
		std::string filename = pDbFd->filename;
		memInfo = &SharedMemoryInfo::table[filename];
		pDbFd->sharedMemory = memInfo;
		memInfo->filename = filename;
		memInfo->regionSize = szRegion;
		++memInfo->refcount;
		// printf("Shared memory for: '%s' (%d refs)\n", filename.c_str(), memInfo->refcount);
	}

	std::vector<void*>& regions = memInfo->regions;

	// Region not allocated yet and the caller did not ask for it to be created.
	if (iRegion >= regions.size() && !bExtend) {
		*pp = nullptr;
		return SQLITE_OK;
	}

	// Grow with zero-filled regions until index iRegion exists (no-op when it already does).
	while (regions.size() <= iRegion) {
		void* mem = new uint8_t[szRegion];
		memset(mem, 0, szRegion);
		regions.push_back(mem);
	}

	*pp = regions[iRegion];
	return SQLITE_OK;
}
|
|
|
|
/*
|
|
** Change the lock state for a shared-memory segment.
|
|
**
|
|
** Note that the relationship between SHAREd and EXCLUSIVE locks is a little
|
|
** different here than in posix. In xShmLock(), one can go from unlocked
|
|
** to shared and back or from unlocked to exclusive and back. But one may
|
|
** not go from shared to exclusive or from exclusive to shared.
|
|
*/
|
|
// sqlite doesn't seem to match these up correctly - it happily calls unlock on locks it doesn't hold.
|
|
// So we have to keep track of which locks are held by a given sqlite3_file
|
|
static int asyncShmLock(sqlite3_file* fd, /* Database file holding the shared memory */
                        int ofst, /* First lock to acquire or release */
                        int n, /* Number of locks to acquire or release */
                        int flags /* What to do with the lock */
) {
	assert(ofst >= 0 && ofst + n <= SQLITE_SHM_NLOCK);
	assert(n >= 1);
	assert(flags == (SQLITE_SHM_LOCK | SQLITE_SHM_SHARED) || flags == (SQLITE_SHM_LOCK | SQLITE_SHM_EXCLUSIVE) ||
	       flags == (SQLITE_SHM_UNLOCK | SQLITE_SHM_SHARED) || flags == (SQLITE_SHM_UNLOCK | SQLITE_SHM_EXCLUSIVE));
	assert(n == 1 || (flags & SQLITE_SHM_EXCLUSIVE) != 0);

	MutexHolder hold(SharedMemoryInfo::mutex);

	VFSAsyncFile* conn = (VFSAsyncFile*)fd;
	SharedMemoryInfo* shm = conn->sharedMemory;
	const int last = ofst + n; // one past the final slot

	if (flags & SQLITE_SHM_UNLOCK) {
		// Drop whatever this connection holds on each slot, shared or exclusive, regardless of which kind
		// the caller named; slots it does not hold are left untouched.
		for (int slot = ofst; slot < last; ++slot) {
			const int bit = 1 << slot;
			if (conn->sharedMemorySharedLocks & bit) {
				conn->sharedMemorySharedLocks &= ~bit;
				--shm->sharedLocks[slot];
			}
			if (conn->sharedMemoryExclusiveLocks & bit) {
				conn->sharedMemoryExclusiveLocks &= ~bit;
				--shm->exclusiveLocks[slot];
			}
		}
		return SQLITE_OK;
	}

	if (flags & SQLITE_SHM_SHARED) {
		// Pass 1: the exclusive count on each slot must equal this connection's own contribution (0 or 1),
		// otherwise some other connection holds it exclusively.
		for (int slot = ofst; slot < last; ++slot) {
			const auto mineExclusive = (conn->sharedMemoryExclusiveLocks >> slot) & 1;
			if (shm->exclusiveLocks[slot] != mineExclusive) {
				//TraceEvent("ShmLocked").detail("File", DEBUG_DETERMINISM ? 0 : (int64_t)pDbFd).detail("Acquiring", "Shared").detail("I", i).detail("Exclusive", memInfo->exclusiveLocks[i]).detail("MyExclusive", pDbFd->sharedMemoryExclusiveLocks);
				return SQLITE_BUSY;
			}
		}
		// Pass 2: record the shared lock on every slot not already held shared by this connection.
		for (int slot = ofst; slot < last; ++slot) {
			const int bit = 1 << slot;
			if (!(conn->sharedMemorySharedLocks & bit)) {
				conn->sharedMemorySharedLocks |= bit;
				shm->sharedLocks[slot]++;
			}
		}
		return SQLITE_OK;
	}

	// Exclusive lock request.
	// Pass 1: both the exclusive and the shared count on each slot must equal this connection's own
	// contribution, i.e. nobody else may hold the slot in any mode.
	for (int slot = ofst; slot < last; ++slot) {
		const auto mineExclusive = (conn->sharedMemoryExclusiveLocks >> slot) & 1;
		const auto mineShared = (conn->sharedMemorySharedLocks >> slot) & 1;
		if (shm->exclusiveLocks[slot] != mineExclusive || shm->sharedLocks[slot] != mineShared) {
			//TraceEvent("ShmLocked").detail("File", DEBUG_DETERMINISM ? 0 : (int64_t)pDbFd).detail("Acquiring", "Exclusive").detail("I", i).detail("Exclusive", memInfo->exclusiveLocks[i]).detail("MyExclusive", pDbFd->sharedMemoryExclusiveLocks).detail("Shared", memInfo->sharedLocks[i]).detail("MyShared", pDbFd->sharedMemorySharedLocks);
			return SQLITE_BUSY;
		}
	}
	// Pass 2: record the exclusive lock on every slot not already held exclusively by this connection.
	for (int slot = ofst; slot < last; ++slot) {
		const int bit = 1 << slot;
		if (!(conn->sharedMemoryExclusiveLocks & bit)) {
			conn->sharedMemoryExclusiveLocks |= bit;
			shm->exclusiveLocks[slot]++;
		}
	}
	return SQLITE_OK;
}
|
|
|
|
/*
|
|
** Implement a memory barrier or memory fence on shared memory.
|
|
**
|
|
** All loads and stores begun before the barrier must complete before
|
|
** any load or store begun after the barrier.
|
|
*/
|
|
// xShmBarrier: issues a full hardware memory fence.
// On Windows this uses MemoryBarrier() from <Windows.h> rather than _ReadWriteBarrier(), which only
// constrains compiler reordering and does not emit a CPU fence. The guard is `#ifdef WIN32` to match the
// guard around the <Windows.h> include at the top of this file.
static void asyncShmBarrier(sqlite3_file*) {
#ifdef WIN32
	MemoryBarrier();
#else
	__sync_synchronize();
#endif
}
|
|
|
|
/*
|
|
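** Close a connection to shared-memory. The underlying storage is never
** deleted here: deleteFlag is asserted to be false when the last connection
** detaches, and the storage is released later, when the last VFSAsyncFile
** for the file is destroyed.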
|
|
**
|
|
** If there is no shared memory associated with the connection then this
|
|
** routine is a harmless no-op.
|
|
*/
|
|
static int asyncShmUnmap(sqlite3_file* fd, /* The underlying database file */
                         int deleteFlag /* Delete shared-memory if true */
) {
	MutexHolder hold(SharedMemoryInfo::mutex);

	VFSAsyncFile* conn = (VFSAsyncFile*)fd;
	SharedMemoryInfo* shm = conn->sharedMemory;
	if (shm != nullptr) {
		// Detach this connection and drop its reference.
		conn->sharedMemory = nullptr;
		// printf("Connection %p closed shared memory\n", fd);

		--shm->refcount;
		if (shm->refcount == 0) {
			//TraceEvent("CleanupSharedMemory").detail("Filename", memInfo->filename.c_str()).detail("RefCount", memInfo->refcount).detail("DeleteFlag", deleteFlag);

			// The regions themselves are intentionally kept; see ~VFSAsyncFile for where they are freed.
			// We don't think deleteFlag will ever be set
			ASSERT(!deleteFlag);
		}
	}
	return SQLITE_OK;
}
|
|
|
|
// Drops this handle's contribution to the per-file open count. When the last handle for a filename goes
// away, the bookkeeping entry and any shared memory recorded for that filename are removed as well.
VFSAsyncFile::~VFSAsyncFile() {
	auto& counts = filename_lockCount_openCount[filename];

	TraceEvent(SevDebug, "VFSAsyncFileDestroyStart")
	    .detail("Filename", filename)
	    .detail("OpenCount", counts.second)
	    .detail("LockCount", counts.first)
	    .backtrace();

	// Other handles are still open on this file: nothing more to do.
	if (--counts.second != 0) {
		return;
	}

	// `counts` is dangling after this erase and must not be used again.
	filename_lockCount_openCount.erase(filename);

	TraceEvent(SevDebug, "VFSAsyncFileDestroy").detail("Filename", filename).backtrace();

	// Always delete the shared memory when the last copy of the file is deleted. In simulation, this is helpful
	// because "killing" a file without properly closing it can result in a shared memory state that causes
	// corruption when reopening the killed file. The only expected penalty from doing this is a potentially slower
	// open operation on a database, but that should happen infrequently.
	//
	// This is not done in ShmUnmap when refcount reaches 0 because SQLite sometimes subsequently tries to
	// reopen the WAL from multiple locations simultaneously, resulting in a locking error.
	auto itr = SharedMemoryInfo::table.find(filename);
	if (itr != SharedMemoryInfo::table.end()) {
		ASSERT_ABORT(itr->second.refcount == 0);
		itr->second.cleanup();
	}
}
|
|
|
|
#endif
|
|
|
|
/*
|
|
** Open a file handle.
|
|
*/
|
|
static int asyncOpen(sqlite3_vfs* pVfs, /* VFS */
                     const char* zName, /* File to open, or 0 for a temp file */
                     sqlite3_file* pFile, /* Pointer to VFSAsyncFile struct to populate */
                     int flags, /* Input SQLITE_OPEN_XXX flags */
                     int* pOutFlags /* Output SQLITE_OPEN_XXX flags (or nullptr) */
) {
	// Method table shared by every file opened through this VFS.
	static const sqlite3_io_methods asyncio = { 3, /* iVersion */
		                                        asyncClose, /* xClose */
		                                        asyncRead, /* xRead */
		                                        asyncWrite, /* xWrite */
		                                        asyncTruncate, /* xTruncate */
		                                        asyncSync, /* xSync */
		                                        VFSAsyncFileSize, /* xFileSize */
		                                        asyncLock, /* xLock */
		                                        asyncUnlock, /* xUnlock */
		                                        asyncCheckReservedLock, /* xCheckReservedLock */
		                                        VFSAsyncFileControl, /* xFileControl */
		                                        asyncSectorSize, /* xSectorSize */
		                                        asyncDeviceCharacteristics, /* xDeviceCharacteristics */
		                                        asyncShmMap,
		                                        asyncShmLock,
		                                        asyncShmBarrier,
		                                        asyncShmUnmap,
		                                        asyncReadZeroCopy,
		                                        asyncReleaseZeroCopy };

	// The open flags are forwarded to IAsyncFile unchanged, so the numeric values must agree.
	static_assert(
	    SQLITE_OPEN_EXCLUSIVE == IAsyncFile::OPEN_EXCLUSIVE && SQLITE_OPEN_CREATE == IAsyncFile::OPEN_CREATE &&
	        SQLITE_OPEN_READONLY == IAsyncFile::OPEN_READONLY && SQLITE_OPEN_READWRITE == IAsyncFile::OPEN_READWRITE,
	    "SQLite flag values don't match IAsyncFile flag values");

	// Temp files (null name) are not supported.
	if (!zName) {
		return SQLITE_IOERR;
	}

	// File creation here is disabled because we always create the files first in KeyValueStoreSQLite, using
	// atomic creation; hence SQLITE_OPEN_EXCLUSIVE and SQLITE_OPEN_CREATE are deliberately not forwarded.
	int asyncFlags = flags & (SQLITE_OPEN_READONLY | SQLITE_OPEN_READWRITE);
	if (flags & SQLITE_OPEN_WAL) {
		asyncFlags |= IAsyncFile::OPEN_LARGE_PAGES;
	}
	asyncFlags |= IAsyncFile::OPEN_LOCK;

	// Zero the caller-provided storage, then construct the VFSAsyncFile in place.
	VFSAsyncFile* handle = (VFSAsyncFile*)pFile; /* Populate this structure */
	memset(static_cast<void*>(handle), 0, sizeof(VFSAsyncFile));
	new (handle) VFSAsyncFile(zName, flags);

	try {
		// Note that SQLiteDB::open also opens the db file, so its flags and modes are important, too
		handle->file = waitForAndGet(IAsyncFileSystem::filesystem()->open(handle->filename, asyncFlags, 0600));

		TraceEvent(SevDebug, "VFSAsyncFileOpened").detail("Filename", handle->filename).backtrace();
	} catch (Error& e) {
		if (e.isInjectedFault()) {
			VFSAsyncFile::setInjectedError(SQLITE_CANTOPEN);
		}
		TraceEvent("VFSAsyncFileOpenError").error(e).detail("Filename", handle->filename);
		// pMethods is still null from the memset above, so destroy the object here.
		handle->~VFSAsyncFile();
		return SQLITE_CANTOPEN;
	}

	handle->base.pMethods = &asyncio;
	if (pOutFlags) {
		*pOutFlags = flags;
	}
	return SQLITE_OK;
}
|
|
|
|
// The next few functions, which perform filesystem operations by path rather than by file, have
|
|
// OS-specific implementations.
|
|
|
|
/*
|
|
** Delete the file identified by argument zPath. If the dirSync parameter
|
|
** is non-zero, then ensure the file-system modification to delete the
|
|
** file has been synced to disk before returning.
|
|
*/
|
|
static int asyncDelete(sqlite3_vfs* pVfs, const char* zPath, int dirSync) {
	// At the moment this isn't used; hence isn't under test. Could easily use
	// IAsyncFileSystem::filesystem()->deleteFile().
	ASSERT(false);

	// Only reached if the assertion above does not stop execution.
	return SQLITE_IOERR_DELETE;
}
|
|
|
|
/*
|
|
** Query the file-system to see if the named file exists, is readable or
|
|
** is both readable and writable. For an exists query, treat a zero-length file
|
|
** as if it does not exist.
|
|
*/
|
|
// xAccess: sets *pResOut to 1 if zPath satisfies the query in `flags` (SQLITE_ACCESS_EXISTS,
// SQLITE_ACCESS_READ or SQLITE_ACCESS_READWRITE) and to 0 otherwise. For an EXISTS query a zero-length file
// is reported as not existing.
// Returns SQLITE_OK, or (Windows only) SQLITE_IOERR_ACCESS when the attribute query fails for a reason other
// than the file or one of its parent directories being absent.
static int asyncAccess(sqlite3_vfs* pVfs, const char* zPath, int flags, int* pResOut) {
#ifdef __unixish__
#ifndef F_OK
#define F_OK 0
#endif
#ifndef R_OK
#define R_OK 4
#endif
#ifndef W_OK
#define W_OK 2
#endif
	int rc; /* access() return code */
	int eAccess = F_OK; /* Second argument to access() */

	assert(flags == SQLITE_ACCESS_EXISTS /* access(zPath, F_OK) */
	       || flags == SQLITE_ACCESS_READ /* access(zPath, R_OK) */
	       || flags == SQLITE_ACCESS_READWRITE /* access(zPath, R_OK|W_OK) */
	);

	if (flags == SQLITE_ACCESS_READWRITE)
		eAccess = R_OK | W_OK;
	if (flags == SQLITE_ACCESS_READ)
		eAccess = R_OK;

	rc = access(zPath, eAccess);
	*pResOut = (rc == 0);

	// An existing but empty file counts as "does not exist".
	if (flags == SQLITE_ACCESS_EXISTS && *pResOut) {
		struct stat buf;
		if (0 == stat(zPath, &buf) && buf.st_size == 0) {
			*pResOut = 0;
		}
	}
	return SQLITE_OK;
#else
	WIN32_FILE_ATTRIBUTE_DATA data;
	DWORD attr = INVALID_FILE_ATTRIBUTES;
	memset(&data, 0, sizeof(data));
	if (GetFileAttributesEx(zPath, GetFileExInfoStandard, &data)) {
		// Leave attr invalid for an empty file on an EXISTS query so it is reported as missing.
		if (!(flags == SQLITE_ACCESS_EXISTS && data.nFileSizeHigh == 0 && data.nFileSizeLow == 0))
			attr = data.dwFileAttributes;
	} else {
		// A missing file and a missing parent directory both simply mean "not accessible"; anything else
		// is a genuine error.
		const DWORD lastError = GetLastError();
		if (lastError != ERROR_FILE_NOT_FOUND && lastError != ERROR_PATH_NOT_FOUND)
			return SQLITE_IOERR_ACCESS;
	}

	if (flags == SQLITE_ACCESS_READWRITE)
		*pResOut = (attr & FILE_ATTRIBUTE_READONLY) == 0; // false for INVALID_FILE_ATTRIBUTES (all bits set)
	else
		*pResOut = attr != INVALID_FILE_ATTRIBUTES;
	return SQLITE_OK;
#endif
}
|
|
|
|
/*
|
|
** Argument zPath points to a nul-terminated string containing a file path.
|
|
** If zPath is an absolute path, then it is copied as is into the output
|
|
** buffer. Otherwise, if it is a relative path, then the equivalent full
|
|
** path is written to the output buffer.
|
|
*/
|
|
static int asyncFullPathname(sqlite3_vfs* pVfs, /* VFS */
                             const char* zPath, /* Input path (possibly a relative path) */
                             int nPathOut, /* Size of output buffer in bytes */
                             char* zPathOut /* Pointer to output buffer */
) {
	try {
		const auto resolved = abspath(zPath);
		// The result plus its terminating NUL must fit in the caller's buffer.
		if (resolved.size() < nPathOut) {
			memcpy(zPathOut, resolved.c_str(), resolved.size() + 1);
			return SQLITE_OK;
		}
		return SQLITE_IOERR;
	} catch (Error& e) {
		if (e.isInjectedFault()) {
			VFSAsyncFile::setInjectedError(SQLITE_IOERR);
		}
		TraceEvent(SevError, "VFSAsyncFullPathnameError").error(e).detail("PathIn", (std::string)zPath);
		return SQLITE_IOERR;
	} catch (...) {
		// Anything that is not a flow Error is reported as unknown_error.
		TraceEvent(SevError, "VFSAsyncFullPathnameError").error(unknown_error()).detail("PathIn", (std::string)zPath);
		return SQLITE_IOERR;
	}
}
|
|
|
|
/*
|
|
** Returns true if there is a shared memory entry for the specified filename,
|
|
** and false otherwise.
|
|
*/
|
|
bool vfsAsyncIsOpen(std::string filename) {
|
|
return SharedMemoryInfo::table.count(abspath(filename)) > 0;
|
|
}
|
|
|
|
/*
|
|
** The following four VFS methods:
|
|
**
|
|
** xDlOpen
|
|
** xDlError
|
|
** xDlSym
|
|
** xDlClose
|
|
**
|
|
** are supposed to implement the functionality needed by SQLite to load
|
|
** extensions compiled as shared objects. This simple VFS does not support
|
|
** this functionality, so the following functions are no-ops.
|
|
*/
|
|
// xDlOpen: loadable extensions are unsupported, so no handle is ever returned.
static void* asyncDlOpen(sqlite3_vfs* pVfs, const char* zPath) {
	return nullptr;
}
|
|
// xDlError: writes a fixed "not supported" message into zErrMsg, which is nByte bytes long.
// Does nothing when there is no room to write (nByte <= 0) or no buffer, so the explicit terminator below
// can never index before the start of the buffer.
static void asyncDlError(sqlite3_vfs* pVfs, int nByte, char* zErrMsg) {
	if (nByte <= 0 || zErrMsg == nullptr) {
		return;
	}
	sqlite3_snprintf(nByte, zErrMsg, "Loadable extensions are not supported");
	zErrMsg[nByte - 1] = '\0';
}
|
|
// xDlSym: loadable extensions are unsupported, so no symbol is ever resolved.
static void (*asyncDlSym(sqlite3_vfs* pVfs, void* pH, const char* z))(void) {
	return nullptr;
}
|
|
// xDlClose: nothing to close because asyncDlOpen never hands out a handle.
static void asyncDlClose(sqlite3_vfs* pVfs, void* pHandle) {}
|
|
|
|
/*
|
|
** Parameter zByte points to a buffer nByte bytes in size. Populate this
|
|
** buffer with pseudo-random data.
|
|
*/
|
|
static int asyncRandomness(sqlite3_vfs* pVfs, int nByte, char* zByte) {
	// One draw from the deterministic generator per output byte, in buffer order.
	for (int remaining = nByte; remaining > 0; --remaining) {
		*zByte++ = deterministicRandom()->randomInt(0, 256);
	}
	return SQLITE_OK;
}
|
|
|
|
/*
|
|
** Sleep for at least nMicro microseconds. Return the (approximate) number
|
|
** of microseconds slept for.
|
|
*/
|
|
static int asyncSleep(sqlite3_vfs* pVfs, int microseconds) {
	try {
		// In simulation the sleep is cut short when the current process is asked to shut down.
		Future<Void> simCancel = Never();
		if (g_network->isSimulated()) {
			simCancel = success(g_simulator.getCurrentProcess()->shutdownSignal.getFuture());
		}

		if (!simCancel.isReady()) {
			// Normal case: wait for the requested time, or until shutdown is signalled.
			waitFor(g_network->delay(microseconds * 1e-6, TaskPriority::DefaultDelay) || simCancel);
			return microseconds;
		}

		// Shutdown already signalled: wait MAX_BUGGIFIED_DELAY instead and report no time slept.
		waitFor(delay(FLOW_KNOBS->MAX_BUGGIFIED_DELAY));
		return 0;
	} catch (Error& e) {
		if (e.isInjectedFault()) {
			VFSAsyncFile::setInjectedError(SQLITE_ERROR);
		}
		TraceEvent(SevError, "VFSAsyncSleepError").errorUnsuppressed(e);
		return 0;
	}
}
|
|
|
|
/*
|
|
** Find the current time (in Universal Coordinated Time). Write into *piNow
|
|
** the current time and date as a Julian Day number times 86_400_000. In
|
|
** other words, write into *piNow the number of milliseconds since the Julian
|
|
** epoch of noon in Greenwich on November 24, 4714 B.C. according to the
|
|
** proleptic Gregorian calendar.
|
|
**
|
|
** On success, return 0. Return 1 if the time and date cannot be found.
|
|
*/
|
|
static int asyncCurrentTimeInt64(sqlite3_vfs* NotUsed, sqlite3_int64* piNow) {
#if __unixish__
	// Offset added to milliseconds-since-Unix-epoch to obtain the Julian-based value.
	static const sqlite3_int64 unixEpoch = 24405875 * (sqlite3_int64)8640000;
	struct timeval now;
	gettimeofday(&now, nullptr);
	const sqlite3_int64 wholeSecondsMs = 1000 * (sqlite3_int64)now.tv_sec;
	const sqlite3_int64 fractionMs = now.tv_usec / 1000;
	*piNow = unixEpoch + wholeSecondsMs + fractionMs;
#elif defined(_WIN32)
	// Offset added to the FILETIME value (after conversion to milliseconds) to obtain the Julian-based value.
	static const sqlite3_int64 winFiletimeEpoch = 23058135 * (sqlite3_int64)8640000;
	int64_t fileTime = 0;
	GetSystemTimeAsFileTime((FILETIME*)&fileTime);
	// The FILETIME value is divided by 10000 to convert it to milliseconds.
	*piNow = winFiletimeEpoch + fileTime / 10000;
#else
#error Port me!
#endif
	return 0;
}
|
|
|
|
/*
|
|
** Set *pTime to the current UTC time expressed as a Julian day. Return
|
|
** SQLITE_OK if successful, or an error code otherwise.
|
|
**
|
|
** http://en.wikipedia.org/wiki/Julian_day
|
|
*/
|
|
static int asyncCurrentTime(sqlite3_vfs* pVfs, double* pTime) {
	sqlite3_int64 julianMillis = 0;
	const int rc = asyncCurrentTimeInt64(pVfs, &julianMillis);
	if (rc == 0) {
		// 86400000 milliseconds per day converts the integer value to fractional Julian days.
		*pTime = julianMillis / 86400000.0;
		return SQLITE_OK;
	}
	// Propagate the failure code; *pTime is left untouched.
	return rc;
}
|
|
|
|
// xGetLastError: no error information is tracked by this VFS; always returns 0 and writes nothing.
static int asyncGetLastError(sqlite3_vfs* NotUsed, int NotUsed2, char* NotUsed3) {
	const int noError = 0;
	return noError;
}
|
|
|
|
/*
|
|
** This function returns a pointer to the VFS implemented in this file.
|
|
** To make the VFS available to SQLite:
|
|
**
|
|
** sqlite3_vfs_register(vfsAsync(), 0);
|
|
*/
|
|
sqlite3_vfs* vfsAsync() {
	// Single process-wide VFS descriptor, initialized on first call. Null entries mark unused hooks.
	static sqlite3_vfs asyncvfs = {
		3, /* iVersion */
		sizeof(VFSAsyncFile), /* szOsFile */
		MAXPATHNAME, /* mxPathname */
		nullptr, /* pNext */
		"fdb_async", /* zName */
		nullptr, /* pAppData */
		asyncOpen, /* xOpen */
		asyncDelete, /* xDelete */
		asyncAccess, /* xAccess */
		asyncFullPathname, /* xFullPathname */
		asyncDlOpen, /* xDlOpen */
		asyncDlError, /* xDlError */
		asyncDlSym, /* xDlSym */
		asyncDlClose, /* xDlClose */
		asyncRandomness, /* xRandomness */
		asyncSleep, /* xSleep */
		asyncCurrentTime, /* xCurrentTime */
		asyncGetLastError, /* xGetLastError */
		asyncCurrentTimeInt64, /* xCurrentTimeInt64 */
		nullptr, /* xSetSystemCall */
		nullptr, /* xGetSystemCall */
		nullptr, /* xNextSystemCall */
	};
	return &asyncvfs;
}
|