foundationdb/fdbserver/VFSAsync.cpp

829 lines
28 KiB
C++
Raw Normal View History

2017-05-26 04:48:44 +08:00
/*
* VFSAsync.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2018 Apple Inc. and the FoundationDB project authors
*
2017-05-26 04:48:44 +08:00
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
2017-05-26 04:48:44 +08:00
* http://www.apache.org/licenses/LICENSE-2.0
*
2017-05-26 04:48:44 +08:00
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "sqlite/sqlite3.h"
#include <stdio.h>
#include <string>
#include <vector>
#include "fdbrpc/fdbrpc.h"
#include "fdbrpc/IAsyncFile.h"
#include "fdbserver/CoroFlow.h"
2017-05-26 04:48:44 +08:00
#include "fdbrpc/simulator.h"
#include "fdbrpc/AsyncFileReadAhead.actor.h"
#include <assert.h>
#include <string.h>
#ifdef WIN32
#include <Windows.h>
2017-05-26 04:48:44 +08:00
#endif
#ifdef __unixish__
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/file.h>
#include <sys/param.h>
#include <sys/time.h>
#include <unistd.h>
#include <errno.h>
#include <fcntl.h>
#endif
#include "fdbserver/VFSAsync.h"
2017-05-26 04:48:44 +08:00
/*
** The maximum pathname length supported by this VFS.
** Reported to SQLite as mxPathname by vfsAsync().
*/
#define MAXPATHNAME 512
// File lock levels, numbered in increasing order of strength. asyncLock() refuses EXCLUSIVE_LOCK and
// asyncUnlock() asserts that it is only ever asked to drop to SHARED_LOCK or below.
#define NO_LOCK 0
#define SHARED_LOCK 1
#define RESERVED_LOCK 2
#define PENDING_LOCK 3
#define EXCLUSIVE_LOCK 4
// Threshold used by asyncCheckReservedLock(): a per-file lock count at or above this value is reported as a
// held reserved lock.
const uint32_t RESERVED_COUNT = 1U << 29;
2017-05-26 04:48:44 +08:00
// Builds the file object for `filename` and registers it in the static filename_lockCount_openCount table:
// pLockCount is pointed at the table's lock counter for this filename and the open counter is incremented.
VFSAsyncFile::VFSAsyncFile(std::string const& filename, int flags)
  : flags(flags), filename(filename), pLockCount(&filename_lockCount_openCount[filename].first), debug_zcrefs(0),
    debug_zcreads(0), debug_reads(0), chunkSize(0) {
	// The entry already exists at this point (it was created while initializing pLockCount above).
	auto& counts = filename_lockCount_openCount[filename];
	++counts.second;

	TraceEvent(SevDebug, "VFSAsyncFileConstruct")
	    .detail("Filename", filename)
	    .detail("OpenCount", counts.second)
	    .detail("LockCount", counts.first)
	    .backtrace();
}
2017-05-26 04:48:44 +08:00
// Per-filename bookkeeping shared by all VFSAsyncFile instances: maps a filename to a
// (lock count, open count) pair. The constructor increments the open count; the destructor decrements it and
// erases the entry when it reaches zero.
std::map<std::string, std::pair<uint32_t, int>> VFSAsyncFile::filename_lockCount_openCount;
2017-05-26 04:48:44 +08:00
// xClose: destroys the VFSAsyncFile living in the storage pointed to by pFile. Always returns SQLITE_OK.
static int asyncClose(sqlite3_file* pFile) {
	auto* self = reinterpret_cast<VFSAsyncFile*>(pFile);

	TraceEvent(SevDebug, "VFSAsyncFileDestroy").detail("Filename", self->filename).backtrace();

	// Every zero-copy read must have been matched by a release before the file is closed.
	ASSERT(!self->debug_zcrefs);

	// asyncOpen() placement-constructed the object in caller-provided storage, so it is destroyed explicitly
	// here rather than deleted.
	self->~VFSAsyncFile();
	return SQLITE_OK;
}
// xRead: reads iAmt bytes at offset iOfst into zBuf.
// Returns SQLITE_OK on a full read, SQLITE_IOERR_SHORT_READ (with the unread tail of zBuf zero-filled) when
// fewer bytes were available, and SQLITE_IOERR_READ if the underlying read throws.
static int asyncRead(sqlite3_file* pFile, void* zBuf, int iAmt, sqlite_int64 iOfst) {
	auto* self = reinterpret_cast<VFSAsyncFile*>(pFile);

	try {
		++self->debug_reads;
		const int bytesRead = waitForAndGet(self->file->read(zBuf, iAmt, iOfst));
		if (bytesRead >= iAmt) {
			return SQLITE_OK;
		}

		// When reading past the EOF, sqlite expects the extra portion of the buffer to be zeroed.
		memset(static_cast<uint8_t*>(zBuf) + bytesRead, 0, iAmt - bytesRead);
		return SQLITE_IOERR_SHORT_READ;
	} catch (Error& e) {
		if (e.isInjectedFault()) {
			VFSAsyncFile::setInjectedError(SQLITE_IOERR_READ);
		}
		return SQLITE_IOERR_READ;
	}
}
#if 1
// Releases a buffer previously handed out by asyncReadZeroCopy() and decrements the outstanding zero-copy
// reference counter. Returns SQLITE_OK, or SQLITE_IOERR if the release throws.
static int asyncReleaseZeroCopy(sqlite3_file* pFile, void* data, int iAmt, sqlite_int64 iOfst) {
	auto* self = reinterpret_cast<VFSAsyncFile*>(pFile);

	try {
		--self->debug_zcrefs;
		self->file->releaseZeroCopy(data, iAmt, iOfst);
		return SQLITE_OK;
	} catch (Error& e) {
		if (e.isInjectedFault()) {
			VFSAsyncFile::setInjectedError(SQLITE_IOERR);
		}
		return SQLITE_IOERR;
	}
}
// Zero-copy read of iAmt bytes at offset iOfst; on success *data points at the bytes.
// If pDataWasCached is non-null it is set to 1 when the read future was already ready when issued, else 0.
// Returns SQLITE_OK on a full read, SQLITE_IOERR_SHORT_READ when fewer than iAmt bytes were available (the
// buffer is released before returning), and SQLITE_IOERR_READ if the read throws.
static int asyncReadZeroCopy(sqlite3_file* pFile, void** data, int iAmt, sqlite_int64 iOfst, int* pDataWasCached) {
	auto* self = reinterpret_cast<VFSAsyncFile*>(pFile);

	try {
		int bytesAvailable = iAmt;
		Future<Void> pending = self->file->readZeroCopy(data, &bytesAvailable, iOfst);

		// Must be sampled before waiting: after the wait the future is always ready.
		if (pDataWasCached != nullptr) {
			*pDataWasCached = pending.isReady() ? 1 : 0;
		}

		waitFor(pending);
		++self->debug_zcrefs;

		if (bytesAvailable >= iAmt) {
			++self->debug_zcreads;
			return SQLITE_OK;
		}

		// When reading past the EOF, sqlite expects the extra portion of the buffer to be zeroed. We can't do
		// that with a zero-copy buffer, so give it back and let sqlite use the slow path.
		asyncReleaseZeroCopy(pFile, *data, bytesAvailable, iOfst);
		return SQLITE_IOERR_SHORT_READ;
	} catch (Error& e) {
		if (e.isInjectedFault()) {
			VFSAsyncFile::setInjectedError(SQLITE_IOERR_READ);
		}
		return SQLITE_IOERR_READ;
	}
}
#else
// Fallback variant (compiled only when the `#if 1` above is switched off): emulates a zero-copy read by
// allocating a fresh buffer with new[] and performing an ordinary read into it.
// NOTE(review): the buffer is not freed here on the short-read or error paths; whether the caller then invokes
// asyncReleaseZeroCopy() is not visible from this file.
static int asyncReadZeroCopy(sqlite3_file* pFile, void** data, int iAmt, sqlite_int64 iOfst) {
	auto* self = reinterpret_cast<VFSAsyncFile*>(pFile);

	try {
		*data = new char[iAmt];
		const int bytesRead = waitForAndGet(self->file->read(*data, iAmt, iOfst));
		if (bytesRead >= iAmt) {
			return SQLITE_OK;
		}

		// When reading past the EOF, sqlite expects the extra portion of the buffer to be zeroed.
		memset(static_cast<uint8_t*>(*data) + bytesRead, 0, iAmt - bytesRead);
		return SQLITE_IOERR_SHORT_READ;
	} catch (Error& e) {
		if (e.isInjectedFault()) {
			VFSAsyncFile::setInjectedError(SQLITE_IOERR_READ);
		}
		return SQLITE_IOERR_READ;
	}
}
// Fallback variant (compiled only when the `#if 1` above is switched off): frees a buffer that the fallback
// asyncReadZeroCopy() allocated with new char[]. Always returns SQLITE_OK.
static int asyncReleaseZeroCopy(sqlite3_file* pFile, void* data, int iAmt, sqlite_int64 iOfst) {
	char* buffer = static_cast<char*>(data);
	delete[] buffer;
	return SQLITE_OK;
}
#endif
// xWrite: writes iAmt bytes from zBuf at offset iOfst and waits for the write to complete.
// Returns SQLITE_OK, or SQLITE_IOERR_WRITE if the write throws.
static int asyncWrite(sqlite3_file* pFile, const void* zBuf, int iAmt, sqlite_int64 iOfst) {
	auto* self = reinterpret_cast<VFSAsyncFile*>(pFile);

	try {
		waitFor(self->file->write(zBuf, iAmt, iOfst));
	} catch (Error& e) {
		if (e.isInjectedFault()) {
			VFSAsyncFile::setInjectedError(SQLITE_IOERR_WRITE);
		}
		return SQLITE_IOERR_WRITE;
	}
	return SQLITE_OK;
}
// xTruncate: sets the file length to `size`. When a chunk size has been configured (see
// SQLITE_FCNTL_CHUNK_SIZE in VFSAsyncFileControl), `size` is first rounded up to the next multiple of it.
// Returns SQLITE_OK, or SQLITE_IOERR_TRUNCATE if the truncate throws.
static int asyncTruncate(sqlite3_file* pFile, sqlite_int64 size) {
	auto* self = reinterpret_cast<VFSAsyncFile*>(pFile);

	if (self->chunkSize != 0) {
		// Number of whole chunks needed to hold `size` bytes (ceiling division).
		const auto chunks = (size + self->chunkSize - 1) / self->chunkSize;
		size = chunks * self->chunkSize;
	}

	try {
		waitFor(self->file->truncate(size));
	} catch (Error& e) {
		if (e.isInjectedFault()) {
			VFSAsyncFile::setInjectedError(SQLITE_IOERR_TRUNCATE);
		}
		return SQLITE_IOERR_TRUNCATE;
	}
	return SQLITE_OK;
}
// xSync: waits for the underlying file's sync() to complete. The `flags` argument is ignored.
// Returns SQLITE_OK, or SQLITE_IOERR_FSYNC (after logging a VFSAsyncFileSyncError trace event) if it throws.
static int asyncSync(sqlite3_file* pFile, int flags) {
	auto* self = reinterpret_cast<VFSAsyncFile*>(pFile);

	try {
		waitFor(self->file->sync());
	} catch (Error& e) {
		if (e.isInjectedFault()) {
			VFSAsyncFile::setInjectedError(SQLITE_IOERR_FSYNC);
		}

		TraceEvent("VFSAsyncFileSyncError")
		    .error(e)
		    .detail("Filename", self->filename)
		    .detail("Sqlite3File", reinterpret_cast<int64_t>(pFile))
		    .detail("IAsyncFile", reinterpret_cast<int64_t>(self->file.getPtr()));

		return SQLITE_IOERR_FSYNC;
	}
	return SQLITE_OK;
}
/*
** xFileSize: store the current size of the file, in bytes, in *pSize.
** Returns SQLITE_OK, or SQLITE_IOERR_FSTAT (leaving *pSize untouched) if the
** size query throws.
*/
static int VFSAsyncFileSize(sqlite3_file* pFile, sqlite_int64* pSize) {
	auto* self = reinterpret_cast<VFSAsyncFile*>(pFile);

	try {
		*pSize = waitForAndGet(self->file->size());
	} catch (Error& e) {
		if (e.isInjectedFault()) {
			VFSAsyncFile::setInjectedError(SQLITE_IOERR_FSTAT);
		}
		return SQLITE_IOERR_FSTAT;
	}
	return SQLITE_OK;
}
// xLock: no lock state is recorded. A request for EXCLUSIVE_LOCK is always refused with SQLITE_BUSY; every
// other lock level is granted immediately.
static int asyncLock(sqlite3_file* pFile, int eLock) {
	if (eLock == EXCLUSIVE_LOCK) {
		return SQLITE_BUSY;
	}
	return SQLITE_OK;
}
// xUnlock: nothing to release, since asyncLock() records no state. Only downgrades to SHARED_LOCK or NO_LOCK
// are expected (checked by assert). Always returns SQLITE_OK.
static int asyncUnlock(sqlite3_file* pFile, int eLock) {
	assert(eLock <= SHARED_LOCK);

	return SQLITE_OK;
}
// xCheckReservedLock: sets *pResOut to 1 when this file's shared lock counter is at or above RESERVED_COUNT,
// and to 0 otherwise. Always returns SQLITE_OK.
static int asyncCheckReservedLock(sqlite3_file* pFile, int* pResOut) {
	auto* self = reinterpret_cast<VFSAsyncFile*>(pFile);

	const bool reserved = *self->pLockCount >= RESERVED_COUNT;
	*pResOut = reserved ? 1 : 0;
	return SQLITE_OK;
}
/*
** xFileControl: two verbs are handled by this VFS.
**
**   SQLITE_FCNTL_CHUNK_SIZE  pArg points to an int; it is stored as the chunk
**                            size that asyncTruncate() rounds sizes up to.
**   SQLITE_FCNTL_SIZE_HINT   pArg points to a 64-bit size; the file is
**                            truncated to it via asyncTruncate().
**
** Any other verb returns SQLITE_NOTFOUND.
**
** NOTE(review): the size hint is applied unconditionally, i.e. also when it is
** smaller than the current file size - confirm that callers rely on this.
*/
static int VFSAsyncFileControl(sqlite3_file* pFile, int op, void* pArg) {
	auto* self = reinterpret_cast<VFSAsyncFile*>(pFile);

	if (op == SQLITE_FCNTL_CHUNK_SIZE) {
		self->chunkSize = *static_cast<int*>(pArg);
		return SQLITE_OK;
	}
	if (op == SQLITE_FCNTL_SIZE_HINT) {
		return asyncTruncate(pFile, *static_cast<int64_t*>(pArg));
	}
	return SQLITE_NOTFOUND;
}
// xSectorSize: reports a fixed sector size of 512 bytes for every file; pFile is not consulted.
static int asyncSectorSize(sqlite3_file* pFile) {
return 512;
} // SOMEDAY: Would 4K be better?
// xDeviceCharacteristics: returns 0, i.e. no device characteristic flags are reported; pFile is not consulted.
static int asyncDeviceCharacteristics(sqlite3_file* pFile) {
return 0;
}
2017-05-26 04:48:44 +08:00
#if 1
struct SharedMemoryInfo { // for a file
std::string filename;
std::vector<void*> regions;
int regionSize;
int refcount; // Number of connections with this open
int sharedLocks[SQLITE_SHM_NLOCK];
int exclusiveLocks[SQLITE_SHM_NLOCK];
SharedMemoryInfo() : regionSize(0), refcount(0) {
memset(sharedLocks, 0, sizeof(sharedLocks));
memset(exclusiveLocks, 0, sizeof(exclusiveLocks));
}
void cleanup() {
for (int i = 0; i < regions.size(); i++)
delete[](uint8_t*) regions[i];
table.erase(filename);
}
static Mutex mutex;
static std::map<std::string, SharedMemoryInfo> table;
};
Mutex SharedMemoryInfo::mutex;
std::map<std::string, SharedMemoryInfo> SharedMemoryInfo::table;
/*
** xShmMap: obtain a pointer to region iRegion of the shared memory associated
** with the database file fd. Regions are numbered from zero and each is
** szRegion bytes in size.
**
** The first call on a connection attaches it to the SharedMemoryInfo entry for
** its filename (creating the entry if needed), records szRegion as the region
** size and bumps the entry's refcount.
**
** If the requested region does not exist yet:
**   - with bExtend == 0, *pp is set to nullptr and SQLITE_OK is returned;
**   - otherwise zero-filled regions are allocated up to and including iRegion.
**
** In every other case *pp is set to the region's buffer. This implementation
** always returns SQLITE_OK.
*/
static int asyncShmMap(sqlite3_file* fd, /* Handle open on database file */
                       int iRegion, /* Region to retrieve */
                       int szRegion, /* Size of regions */
                       int bExtend, /* True to extend file if necessary */
                       void volatile** pp /* OUT: Mapped memory */
) {
	MutexHolder hold(SharedMemoryInfo::mutex);
	auto* conn = reinterpret_cast<VFSAsyncFile*>(fd);

	SharedMemoryInfo* shm = conn->sharedMemory;
	if (shm != nullptr) {
		assert(shm->regionSize == szRegion);
	} else {
		// First mapping on this connection: attach to (or create) the entry for the file.
		const std::string name = conn->filename;
		shm = &SharedMemoryInfo::table[name];
		conn->sharedMemory = shm;
		shm->filename = name;
		shm->regionSize = szRegion;
		++shm->refcount;
	}

	std::vector<void*>& regions = shm->regions;
	const size_t wanted = static_cast<size_t>(iRegion);

	if (wanted >= regions.size()) {
		if (!bExtend) {
			*pp = nullptr;
			return SQLITE_OK;
		}
		// Grow the region list with zeroed buffers until `wanted` is a valid index.
		while (regions.size() <= wanted) {
			void* fresh = new uint8_t[szRegion];
			memset(fresh, 0, szRegion);
			regions.push_back(fresh);
		}
	}

	*pp = regions[wanted];
	return SQLITE_OK;
}
2017-05-26 04:48:44 +08:00
/*
** xShmLock: change the lock state of slots [ofst, ofst + n) of a shared-memory
** segment.
**
** Note that the relationship between SHAREd and EXCLUSIVE locks is a little
** different here than in posix. In xShmLock(), one can go from unlocked
** to shared and back or from unlocked to exclusive and back. But one may
** not go from shared to exclusive or from exclusive to shared.
**
** sqlite doesn't seem to match lock/unlock calls up correctly - it happily
** calls unlock on locks it doesn't hold. So each connection remembers, as one
** bit per slot, which shared and exclusive locks it holds itself, and only
** those are released on unlock.
**
** Returns SQLITE_BUSY (changing nothing) when a lock request conflicts with a
** lock held by another connection, SQLITE_OK otherwise.
*/
static int asyncShmLock(sqlite3_file* fd, /* Database file holding the shared memory */
                        int ofst, /* First lock to acquire or release */
                        int n, /* Number of locks to acquire or release */
                        int flags /* What to do with the lock */
) {
	assert(ofst >= 0 && ofst + n <= SQLITE_SHM_NLOCK);
	assert(n >= 1);
	assert(flags == (SQLITE_SHM_LOCK | SQLITE_SHM_SHARED) || flags == (SQLITE_SHM_LOCK | SQLITE_SHM_EXCLUSIVE) ||
	       flags == (SQLITE_SHM_UNLOCK | SQLITE_SHM_SHARED) || flags == (SQLITE_SHM_UNLOCK | SQLITE_SHM_EXCLUSIVE));
	assert(n == 1 || (flags & SQLITE_SHM_EXCLUSIVE) != 0);

	MutexHolder hold(SharedMemoryInfo::mutex);
	auto* conn = reinterpret_cast<VFSAsyncFile*>(fd);
	SharedMemoryInfo* shm = conn->sharedMemory;
	const int last = ofst + n;

	if (flags & SQLITE_SHM_UNLOCK) {
		// Release whatever this connection actually holds on each slot, shared and exclusive alike.
		for (int slot = ofst; slot < last; ++slot) {
			const int bit = 1 << slot;
			if (conn->sharedMemorySharedLocks & bit) {
				conn->sharedMemorySharedLocks &= ~bit;
				--shm->sharedLocks[slot];
			}
			if (conn->sharedMemoryExclusiveLocks & bit) {
				conn->sharedMemoryExclusiveLocks &= ~bit;
				--shm->exclusiveLocks[slot];
			}
		}
		return SQLITE_OK;
	}

	if (flags & SQLITE_SHM_SHARED) {
		// Pass 1: refuse if any slot is exclusively locked by someone other than this connection.
		for (int slot = ofst; slot < last; ++slot) {
			if (shm->exclusiveLocks[slot] != ((conn->sharedMemoryExclusiveLocks >> slot) & 1)) {
				return SQLITE_BUSY;
			}
		}
		// Pass 2: take the shared lock on every slot not already held by this connection.
		for (int slot = ofst; slot < last; ++slot) {
			const int bit = 1 << slot;
			if (!(conn->sharedMemorySharedLocks & bit)) {
				conn->sharedMemorySharedLocks |= bit;
				++shm->sharedLocks[slot];
			}
		}
		return SQLITE_OK;
	}

	// Exclusive request.
	// Pass 1: refuse if any slot carries a shared or exclusive lock not owned by this connection.
	for (int slot = ofst; slot < last; ++slot) {
		if (shm->exclusiveLocks[slot] != ((conn->sharedMemoryExclusiveLocks >> slot) & 1) ||
		    shm->sharedLocks[slot] != ((conn->sharedMemorySharedLocks >> slot) & 1)) {
			return SQLITE_BUSY;
		}
	}
	// Pass 2: take the exclusive lock on every slot not already held by this connection.
	for (int slot = ofst; slot < last; ++slot) {
		const int bit = 1 << slot;
		if (!(conn->sharedMemoryExclusiveLocks & bit)) {
			conn->sharedMemoryExclusiveLocks |= bit;
			++shm->exclusiveLocks[slot];
		}
	}
	return SQLITE_OK;
}
2017-05-26 04:48:44 +08:00
/*
** xShmBarrier: implement a memory barrier or memory fence on shared memory.
**
** All loads and stores begun before the barrier must complete before
** any load or store begun after the barrier.
**
** Uses _ReadWriteBarrier() when WIN32 is set and __sync_synchronize()
** everywhere else.
*/
static void asyncShmBarrier(sqlite3_file*) {
#if WIN32
	_ReadWriteBarrier();
#else
	__sync_synchronize();
#endif
}
/*
** xShmUnmap: close this connection's link to shared memory.
**
** If there is no shared memory associated with the connection then this
** routine is a harmless no-op. Otherwise the connection is detached and the
** entry's refcount is decremented. The entry itself is NOT freed here, even
** when the refcount reaches zero; that happens in ~VFSAsyncFile().
**
** deleteFlag is never expected to be set when the last reference goes away;
** this is enforced with ASSERT.
*/
static int asyncShmUnmap(sqlite3_file* fd, /* The underlying database file */
                         int deleteFlag /* Delete shared-memory if true */
) {
	MutexHolder hold(SharedMemoryInfo::mutex);
	auto* conn = reinterpret_cast<VFSAsyncFile*>(fd);

	SharedMemoryInfo* shm = conn->sharedMemory;
	if (shm == nullptr) {
		return SQLITE_OK;
	}
	conn->sharedMemory = nullptr;

	--shm->refcount;
	if (shm->refcount == 0) {
		// We don't think deleteFlag will ever be set
		ASSERT(!deleteFlag);
	}

	return SQLITE_OK;
}
2017-05-26 04:48:44 +08:00
// Drops this object's registration in filename_lockCount_openCount. When the last open instance for the
// filename goes away, the table entry is erased and any SharedMemoryInfo for the file is freed.
VFSAsyncFile::~VFSAsyncFile() {
	auto& counts = filename_lockCount_openCount[filename];

	TraceEvent(SevDebug, "VFSAsyncFileDestroyStart")
	    .detail("Filename", filename)
	    .detail("OpenCount", counts.second)
	    .detail("LockCount", counts.first)
	    .backtrace();

	--counts.second;
	if (counts.second != 0) {
		return; // Other instances of this file are still open.
	}

	// `counts` dangles after this erase and must not be used again.
	filename_lockCount_openCount.erase(filename);
	TraceEvent(SevDebug, "VFSAsyncFileDestroy").detail("Filename", filename).backtrace();

	// Always delete the shared memory when the last copy of the file is deleted. In simulation, this is helpful
	// because "killing" a file without properly closing it can result in a shared memory state that causes
	// corruption when reopening the killed file. The only expected penalty from doing this is a potentially slower
	// open operation on a database, but that should happen infrequently.
	//
	// We can't do this in ShmUnmap when refcount is 0 because it seems that SQLite sometimes subsequently tries to
	// reopen the WAL from multiple locations simultaneously, resulting in a locking error
	auto shmEntry = SharedMemoryInfo::table.find(filename);
	if (shmEntry == SharedMemoryInfo::table.end()) {
		return;
	}
	ASSERT_ABORT(shmEntry->second.refcount == 0);
	shmEntry->second.cleanup();
}
2017-05-26 04:48:44 +08:00
#endif
/*
** xOpen: open a file handle.
**
** Placement-constructs a VFSAsyncFile in the storage pointed to by pFile and
** opens the underlying IAsyncFile for zName. Only the READONLY/READWRITE bits
** of `flags` are forwarded to the filesystem; OPEN_LOCK is always added and
** OPEN_LARGE_PAGES is added for WAL files.
**
** Returns SQLITE_OK on success (pMethods is then set and *pOutFlags, if
** non-null, receives `flags`), SQLITE_IOERR when zName is null (temp files are
** not supported) and SQLITE_CANTOPEN when the underlying open throws.
**
** On every failure path pFile->pMethods is left null: SQLite may call xClose
** after a failed xOpen if pMethods is non-null.
*/
static int asyncOpen(sqlite3_vfs* pVfs, /* VFS */
                     const char* zName, /* File to open, or 0 for a temp file */
                     sqlite3_file* pFile, /* Pointer to VFSAsyncFile struct to populate */
                     int flags, /* Input SQLITE_OPEN_XXX flags */
                     int* pOutFlags /* Output SQLITE_OPEN_XXX flags (or nullptr) */
) {
	static const sqlite3_io_methods asyncio = { 3, /* iVersion */
		                                        asyncClose, /* xClose */
		                                        asyncRead, /* xRead */
		                                        asyncWrite, /* xWrite */
		                                        asyncTruncate, /* xTruncate */
		                                        asyncSync, /* xSync */
		                                        VFSAsyncFileSize, /* xFileSize */
		                                        asyncLock, /* xLock */
		                                        asyncUnlock, /* xUnlock */
		                                        asyncCheckReservedLock, /* xCheckReservedLock */
		                                        VFSAsyncFileControl, /* xFileControl */
		                                        asyncSectorSize, /* xSectorSize */
		                                        asyncDeviceCharacteristics, /* xDeviceCharacteristics */
		                                        asyncShmMap,
		                                        asyncShmLock,
		                                        asyncShmBarrier,
		                                        asyncShmUnmap,
		                                        asyncReadZeroCopy,
		                                        asyncReleaseZeroCopy };

	VFSAsyncFile* p = (VFSAsyncFile*)pFile; /* Populate this structure */

	if (zName == 0) {
		// Nothing has been constructed yet, so make sure SQLite does not try to xClose this handle.
		pFile->pMethods = nullptr;
		return SQLITE_IOERR;
	}

	static_assert(
	    SQLITE_OPEN_EXCLUSIVE == IAsyncFile::OPEN_EXCLUSIVE && SQLITE_OPEN_CREATE == IAsyncFile::OPEN_CREATE &&
	        SQLITE_OPEN_READONLY == IAsyncFile::OPEN_READONLY && SQLITE_OPEN_READWRITE == IAsyncFile::OPEN_READWRITE,
	    "SQLite flag values don't match IAsyncFile flag values");

	// File creation here is disabled because we always create the files first in KeyValueStoreSQLite, using atomic
	// creation
	int oflags =
	    flags & (/*SQLITE_OPEN_EXCLUSIVE | SQLITE_OPEN_CREATE |*/ SQLITE_OPEN_READONLY | SQLITE_OPEN_READWRITE);
	if (flags & SQLITE_OPEN_WAL)
		oflags |= IAsyncFile::OPEN_LARGE_PAGES;
	oflags |= IAsyncFile::OPEN_LOCK;

	// Zero the storage first (this also leaves base.pMethods null), then construct the object in place.
	memset(static_cast<void*>(p), 0, sizeof(VFSAsyncFile));
	new (p) VFSAsyncFile(zName, flags);

	try {
		// Note that SQLiteDB::open also opens the db file, so its flags and modes are important, too
		p->file = waitForAndGet(IAsyncFileSystem::filesystem()->open(p->filename, oflags, 0600));
		TraceEvent(SevDebug, "VFSAsyncFileOpened").detail("Filename", p->filename).backtrace();
	} catch (Error& e) {
		if (e.isInjectedFault()) {
			VFSAsyncFile::setInjectedError(SQLITE_CANTOPEN);
		}
		TraceEvent("VFSAsyncFileOpenError").error(e).detail("Filename", p->filename);
		// pMethods has not been assigned yet, so SQLite will not call xClose; destroy the object here.
		p->~VFSAsyncFile();
		return SQLITE_CANTOPEN;
	}

	if (pOutFlags) {
		*pOutFlags = flags;
	}
	p->base.pMethods = &asyncio;
	return SQLITE_OK;
}
// The next few functions, which perform filesystem operations by path rather than by file, have
// OS-specific implementations.
/*
** xDelete: delete the file identified by argument zPath. If the dirSync
** parameter is non-zero, the deletion is supposed to be synced to disk before
** returning.
**
** Not implemented: the function asserts and then reports
** SQLITE_IOERR_DELETE.
*/
static int asyncDelete(sqlite3_vfs* pVfs, const char* zPath, int dirSync) {
	// At the moment this isn't used; hence isn't under test. Could easily use
	// IAsyncFileSystem::filesystem()->deleteFile().
	ASSERT(false);
	return SQLITE_IOERR_DELETE;
}
/*
** xAccess: query the file-system to see if the named file exists, is readable
** or is both readable and writable, and store the answer (0 or 1) in *pResOut.
** For an exists query, a zero-length file is treated as if it does not exist.
**
** The unixish build always returns SQLITE_OK; the other (Windows) build
** returns SQLITE_IOERR_ACCESS when the attribute query fails for a reason
** other than "file not found".
*/
static int asyncAccess(sqlite3_vfs* pVfs, const char* zPath, int flags, int* pResOut) {
#ifdef __unixish__
#ifndef F_OK
#define F_OK 0
#endif
#ifndef R_OK
#define R_OK 4
#endif
#ifndef W_OK
#define W_OK 2
#endif
	assert(flags == SQLITE_ACCESS_EXISTS /* access(zPath, F_OK) */
	       || flags == SQLITE_ACCESS_READ /* access(zPath, R_OK) */
	       || flags == SQLITE_ACCESS_READWRITE /* access(zPath, R_OK|W_OK) */
	);

	// Translate the SQLite query into the mode argument of access(); existence check is the default.
	int mode = F_OK;
	if (flags == SQLITE_ACCESS_READWRITE) {
		mode = R_OK | W_OK;
	} else if (flags == SQLITE_ACCESS_READ) {
		mode = R_OK;
	}

	const bool accessible = (access(zPath, mode) == 0);
	*pResOut = accessible;

	if (flags == SQLITE_ACCESS_EXISTS && accessible) {
		// An existing but empty file counts as non-existent.
		struct stat info;
		if (stat(zPath, &info) == 0 && info.st_size == 0) {
			*pResOut = 0;
		}
	}
	return SQLITE_OK;
#else
	WIN32_FILE_ATTRIBUTE_DATA info;
	DWORD attributes = INVALID_FILE_ATTRIBUTES;
	memset(&info, 0, sizeof(info));

	if (GetFileAttributesEx(zPath, GetFileExInfoStandard, &info)) {
		// For an exists query an empty file keeps the "invalid" marker, i.e. is reported as missing.
		if (flags != SQLITE_ACCESS_EXISTS || info.nFileSizeHigh != 0 || info.nFileSizeLow != 0) {
			attributes = info.dwFileAttributes;
		}
	} else if (GetLastError() != ERROR_FILE_NOT_FOUND) {
		return SQLITE_IOERR_ACCESS;
	}

	if (flags == SQLITE_ACCESS_READWRITE) {
		*pResOut = (attributes & FILE_ATTRIBUTE_READONLY) == 0;
	} else {
		*pResOut = attributes != INVALID_FILE_ATTRIBUTES;
	}
	return SQLITE_OK;
#endif
}
/*
** xFullPathname: argument zPath points to a nul-terminated string containing a
** file path. The absolute form of that path, as computed by abspath(), is
** written to zPathOut including its terminating nul.
**
** Returns SQLITE_OK on success. Returns SQLITE_IOERR when the result (plus
** terminator) does not fit into the nPathOut bytes of zPathOut, when nPathOut
** is not positive, or when abspath() throws (the latter is also logged as a
** VFSAsyncFullPathnameError trace event).
*/
static int asyncFullPathname(sqlite3_vfs* pVfs, /* VFS */
                             const char* zPath, /* Input path (possibly a relative path) */
                             int nPathOut, /* Size of output buffer in bytes */
                             char* zPathOut /* Pointer to output buffer */
) {
	try {
		auto s = abspath(zPath);

		// Compare in the unsigned domain only after ruling out non-positive sizes: a negative nPathOut
		// would otherwise convert to a huge value, pass the check and let memcpy overrun the buffer.
		if (nPathOut <= 0 || s.size() >= static_cast<size_t>(nPathOut))
			return SQLITE_IOERR;

		memcpy(zPathOut, s.c_str(), s.size() + 1);
		return SQLITE_OK;
	} catch (Error& e) {
		if (e.isInjectedFault()) {
			VFSAsyncFile::setInjectedError(SQLITE_IOERR);
		}
		TraceEvent(SevError, "VFSAsyncFullPathnameError").error(e).detail("PathIn", (std::string)zPath);
		return SQLITE_IOERR;
	} catch (...) {
		TraceEvent(SevError, "VFSAsyncFullPathnameError").error(unknown_error()).detail("PathIn", (std::string)zPath);
		return SQLITE_IOERR;
	}
}
/*
** Returns true if the shared memory table holds an entry for the absolute
** form of the specified filename, and false otherwise.
*/
bool vfsAsyncIsOpen(std::string filename) {
	const auto& shmTable = SharedMemoryInfo::table;
	return shmTable.find(abspath(filename)) != shmTable.end();
}
/*
** The following four VFS methods:
**
**   xDlOpen
**   xDlError
**   xDlSym
**   xDlClose
**
** are supposed to implement the functionality needed by SQLite to load
** extensions compiled as shared objects. This simple VFS does not support
** this functionality, so the following functions are no-ops.
*/
// xDlOpen: never opens anything; always returns a null handle.
static void* asyncDlOpen(sqlite3_vfs* pVfs, const char* zPath) {
	return nullptr;
}
// xDlError: writes a fixed, nul-terminated "not supported" message into the nByte-byte buffer zErrMsg,
// truncating it if the buffer is too small. Does nothing when nByte is not positive.
static void asyncDlError(sqlite3_vfs* pVfs, int nByte, char* zErrMsg) {
	// Without this guard the explicit terminator below would be written at zErrMsg[-1] (or lower).
	if (nByte <= 0) {
		return;
	}
	sqlite3_snprintf(nByte, zErrMsg, "Loadable extensions are not supported");
	zErrMsg[nByte - 1] = '\0';
}
// xDlSym: no symbols can be resolved; always returns a null function pointer.
static void (*asyncDlSym(sqlite3_vfs* pVfs, void* pH, const char* z))(void) {
	return nullptr;
}
// xDlClose: nothing was ever opened by asyncDlOpen(), so there is nothing to close.
static void asyncDlClose(sqlite3_vfs* pVfs, void* pHandle) {}
/*
** xRandomness: parameter zByte points to a buffer nByte bytes in size.
** Populate this buffer with pseudo-random data, one byte per call to
** deterministicRandom()->randomInt(0, 256). Always returns SQLITE_OK.
*/
static int asyncRandomness(sqlite3_vfs* pVfs, int nByte, char* zByte) {
	int filled = 0;
	while (filled < nByte) {
		zByte[filled] = deterministicRandom()->randomInt(0, 256);
		++filled;
	}
	return SQLITE_OK;
}
/*
** xSleep: sleep for the requested number of microseconds and return the
** (approximate) number of microseconds slept for.
**
** In simulation the sleep is cut short when the current process's shutdown
** signal fires. If that signal has already fired on entry, the function waits
** for MAX_BUGGIFIED_DELAY instead and returns 0. Errors are logged as
** VFSAsyncSleepError and also yield 0.
*/
static int asyncSleep(sqlite3_vfs* pVfs, int microseconds) {
	try {
		// Never fires outside of simulation.
		Future<Void> shutdown = Never();
		if (g_network->isSimulated()) {
			shutdown = success(g_simulator.getCurrentProcess()->shutdownSignal.getFuture());
		}

		if (shutdown.isReady()) {
			waitFor(delay(FLOW_KNOBS->MAX_BUGGIFIED_DELAY));
			return 0;
		}

		const double seconds = microseconds * 1e-6;
		waitFor(g_network->delay(seconds, TaskPriority::DefaultDelay) || shutdown);
		return microseconds;
	} catch (Error& e) {
		if (e.isInjectedFault()) {
			VFSAsyncFile::setInjectedError(SQLITE_ERROR);
		}
		TraceEvent(SevError, "VFSAsyncSleepError").error(e, true);
		return 0;
	}
}
/*
** xCurrentTimeInt64: find the current time (in Universal Coordinated Time).
** Write into *piNow the current time and date as a Julian Day number times
** 86_400_000. In other words, write into *piNow the number of milliseconds
** since the Julian epoch of noon in Greenwich on November 24, 4714 B.C
** according to the proleptic Gregorian calendar.
**
** This implementation always returns 0.
*/
static int asyncCurrentTimeInt64(sqlite3_vfs* NotUsed, sqlite3_int64* piNow) {
#if __unixish__
	// Offset added to milliseconds-since-Unix-epoch to obtain the Julian millisecond count.
	static const sqlite3_int64 unixEpoch = 24405875 * static_cast<sqlite3_int64>(8640000);

	struct timeval now;
	gettimeofday(&now, nullptr);

	const sqlite3_int64 wholeSecondsMs = 1000 * static_cast<sqlite3_int64>(now.tv_sec);
	const auto fractionMs = now.tv_usec / 1000;
	*piNow = unixEpoch + wholeSecondsMs + fractionMs;
#elif defined(_WIN32)
	// Offset added to the FILETIME value, once scaled to milliseconds, to obtain the Julian millisecond count.
	static const sqlite3_int64 winFiletimeEpoch = 23058135 * static_cast<sqlite3_int64>(8640000);

	int64_t fileTime = 0;
	GetSystemTimeAsFileTime(reinterpret_cast<FILETIME*>(&fileTime));

	// FILETIME counts 100ns ticks; 10000 ticks make one millisecond.
	*piNow = winFiletimeEpoch + fileTime / 10000;
#else
#error Port me!
#endif
	return 0;
}
/*
** xCurrentTime: set *pTime to the current UTC time expressed as a fractional
** Julian day, derived from asyncCurrentTimeInt64(). Returns SQLITE_OK if
** successful, or the error code of asyncCurrentTimeInt64() otherwise (in
** which case *pTime is left untouched).
**
** http://en.wikipedia.org/wiki/Julian_day
*/
static int asyncCurrentTime(sqlite3_vfs* pVfs, double* pTime) {
	sqlite3_int64 julianMs = 0;

	const int status = asyncCurrentTimeInt64(pVfs, &julianMs);
	if (status != 0) {
		return status;
	}

	// 86,400,000 milliseconds per day.
	*pTime = julianMs / 86400000.0;
	return SQLITE_OK;
}
// xGetLastError: no error information is tracked by this VFS; all arguments are ignored, the output buffer is
// left untouched and 0 is returned.
static int asyncGetLastError(sqlite3_vfs* NotUsed, int NotUsed2, char* NotUsed3) {
return 0;
}
2017-05-26 04:48:44 +08:00
/*
** Returns a pointer to the VFS implemented in this file. The sqlite3_vfs
** object is a function-local static, so every call returns the same pointer.
** To make the VFS available to SQLite:
**
**   sqlite3_vfs_register(vfsAsync(), 0);
*/
sqlite3_vfs* vfsAsync() {
	static sqlite3_vfs asyncvfs = {
		3, /* iVersion */
		sizeof(VFSAsyncFile), /* szOsFile */
		MAXPATHNAME, /* mxPathname */
		nullptr, /* pNext */
		"fdb_async", /* zName */
		nullptr, /* pAppData */
		asyncOpen, /* xOpen */
		asyncDelete, /* xDelete */
		asyncAccess, /* xAccess */
		asyncFullPathname, /* xFullPathname */
		asyncDlOpen, /* xDlOpen */
		asyncDlError, /* xDlError */
		asyncDlSym, /* xDlSym */
		asyncDlClose, /* xDlClose */
		asyncRandomness, /* xRandomness */
		asyncSleep, /* xSleep */
		asyncCurrentTime, /* xCurrentTime */
		asyncGetLastError, /* xGetLastError */
		asyncCurrentTimeInt64, /* xCurrentTimeInt64 */
		nullptr, /* xSetSystemCall */
		nullptr, /* xGetSystemCall */
		nullptr, /* xNextSystemCall */
	};
	return &asyncvfs;
}