ceph: add buffered/direct exclusionary locking for reads and writes

xfstest generic/451 intermittently fails. The test does O_DIRECT writes
to a file, and then reads back the result using buffered I/O, while
running a separate set of tasks that are also doing buffered reads.

The client will invalidate the cache prior to a direct write, but it's
easy for one of the other readers' replies to race in and reinstantiate
the invalidated range with stale data.

To fix this, we must serialize direct I/O writes and buffered reads.
We could just sprinkle in some shared locks on the i_rwsem for reads,
and increase the exclusive footprint on the write side, but that would
cause O_DIRECT writes to end up serialized vs. other direct requests.

Instead, borrow the scheme used by nfs.ko. Buffered writes take the
i_rwsem exclusively, but buffered reads take a shared lock, allowing
them to run in parallel.

O_DIRECT requests also take a shared lock, but we need for them to not
run in parallel with buffered reads.  A flag on the ceph_inode_info is
used to indicate whether it's in direct or buffered I/O mode. When a
conflicting request is submitted, it will block until the inode can be
flipped to the necessary mode.

Link: https://tracker.ceph.com/issues/40985
Signed-off-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: "Yan, Zheng" <zyan@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
This commit is contained in:
Jeff Layton 2019-08-02 13:15:39 -04:00 committed by Ilya Dryomov
parent 4766815b11
commit 321fe13c93
5 changed files with 200 additions and 16 deletions

View File

@ -6,7 +6,7 @@
obj-$(CONFIG_CEPH_FS) += ceph.o obj-$(CONFIG_CEPH_FS) += ceph.o
ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \ ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \
export.o caps.o snap.o xattr.o quota.o \ export.o caps.o snap.o xattr.o quota.o io.o \
mds_client.o mdsmap.o strings.o ceph_frag.o \ mds_client.o mdsmap.o strings.o ceph_frag.o \
debugfs.o debugfs.o

View File

@ -15,6 +15,7 @@
#include "super.h" #include "super.h"
#include "mds_client.h" #include "mds_client.h"
#include "cache.h" #include "cache.h"
#include "io.h"
static __le32 ceph_flags_sys2wire(u32 flags) static __le32 ceph_flags_sys2wire(u32 flags)
{ {
@ -930,7 +931,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
struct ceph_aio_request *aio_req = NULL; struct ceph_aio_request *aio_req = NULL;
int num_pages = 0; int num_pages = 0;
int flags; int flags;
int ret; int ret = 0;
struct timespec64 mtime = current_time(inode); struct timespec64 mtime = current_time(inode);
size_t count = iov_iter_count(iter); size_t count = iov_iter_count(iter);
loff_t pos = iocb->ki_pos; loff_t pos = iocb->ki_pos;
@ -944,11 +945,6 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
(write ? "write" : "read"), file, pos, (unsigned)count, (write ? "write" : "read"), file, pos, (unsigned)count,
snapc, snapc ? snapc->seq : 0); snapc, snapc ? snapc->seq : 0);
ret = filemap_write_and_wait_range(inode->i_mapping,
pos, pos + count - 1);
if (ret < 0)
return ret;
if (write) { if (write) {
int ret2 = invalidate_inode_pages2_range(inode->i_mapping, int ret2 = invalidate_inode_pages2_range(inode->i_mapping,
pos >> PAGE_SHIFT, pos >> PAGE_SHIFT,
@ -1284,12 +1280,16 @@ again:
if (ci->i_inline_version == CEPH_INLINE_NONE) { if (ci->i_inline_version == CEPH_INLINE_NONE) {
if (!retry_op && (iocb->ki_flags & IOCB_DIRECT)) { if (!retry_op && (iocb->ki_flags & IOCB_DIRECT)) {
ceph_start_io_direct(inode);
ret = ceph_direct_read_write(iocb, to, ret = ceph_direct_read_write(iocb, to,
NULL, NULL); NULL, NULL);
ceph_end_io_direct(inode);
if (ret >= 0 && ret < len) if (ret >= 0 && ret < len)
retry_op = CHECK_EOF; retry_op = CHECK_EOF;
} else { } else {
ceph_start_io_read(inode);
ret = ceph_sync_read(iocb, to, &retry_op); ret = ceph_sync_read(iocb, to, &retry_op);
ceph_end_io_read(inode);
} }
} else { } else {
retry_op = READ_INLINE; retry_op = READ_INLINE;
@ -1300,7 +1300,9 @@ again:
inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len,
ceph_cap_string(got)); ceph_cap_string(got));
ceph_add_rw_context(fi, &rw_ctx); ceph_add_rw_context(fi, &rw_ctx);
ceph_start_io_read(inode);
ret = generic_file_read_iter(iocb, to); ret = generic_file_read_iter(iocb, to);
ceph_end_io_read(inode);
ceph_del_rw_context(fi, &rw_ctx); ceph_del_rw_context(fi, &rw_ctx);
} }
dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n", dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n",
@ -1409,7 +1411,10 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
return -ENOMEM; return -ENOMEM;
retry_snap: retry_snap:
inode_lock(inode); if (iocb->ki_flags & IOCB_DIRECT)
ceph_start_io_direct(inode);
else
ceph_start_io_write(inode);
/* We can write back this queue in page reclaim */ /* We can write back this queue in page reclaim */
current->backing_dev_info = inode_to_bdi(inode); current->backing_dev_info = inode_to_bdi(inode);
@ -1480,7 +1485,6 @@ retry_snap:
(ci->i_ceph_flags & CEPH_I_ERROR_WRITE)) { (ci->i_ceph_flags & CEPH_I_ERROR_WRITE)) {
struct ceph_snap_context *snapc; struct ceph_snap_context *snapc;
struct iov_iter data; struct iov_iter data;
inode_unlock(inode);
spin_lock(&ci->i_ceph_lock); spin_lock(&ci->i_ceph_lock);
if (__ceph_have_pending_cap_snap(ci)) { if (__ceph_have_pending_cap_snap(ci)) {
@ -1497,11 +1501,14 @@ retry_snap:
/* we might need to revert back to that point */ /* we might need to revert back to that point */
data = *from; data = *from;
if (iocb->ki_flags & IOCB_DIRECT) if (iocb->ki_flags & IOCB_DIRECT) {
written = ceph_direct_read_write(iocb, &data, snapc, written = ceph_direct_read_write(iocb, &data, snapc,
&prealloc_cf); &prealloc_cf);
else ceph_end_io_direct(inode);
} else {
written = ceph_sync_write(iocb, &data, pos, snapc); written = ceph_sync_write(iocb, &data, pos, snapc);
ceph_end_io_write(inode);
}
if (written > 0) if (written > 0)
iov_iter_advance(from, written); iov_iter_advance(from, written);
ceph_put_snap_context(snapc); ceph_put_snap_context(snapc);
@ -1516,7 +1523,7 @@ retry_snap:
written = generic_perform_write(file, from, pos); written = generic_perform_write(file, from, pos);
if (likely(written >= 0)) if (likely(written >= 0))
iocb->ki_pos = pos + written; iocb->ki_pos = pos + written;
inode_unlock(inode); ceph_end_io_write(inode);
} }
if (written >= 0) { if (written >= 0) {
@ -1551,9 +1558,11 @@ retry_snap:
} }
goto out_unlocked; goto out_unlocked;
out: out:
inode_unlock(inode); if (iocb->ki_flags & IOCB_DIRECT)
ceph_end_io_direct(inode);
else
ceph_end_io_write(inode);
out_unlocked: out_unlocked:
ceph_free_cap_flush(prealloc_cf); ceph_free_cap_flush(prealloc_cf);
current->backing_dev_info = NULL; current->backing_dev_info = NULL;

163
fs/ceph/io.c Normal file
View File

@ -0,0 +1,163 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2016 Trond Myklebust
* Copyright (c) 2019 Jeff Layton
*
* I/O and data path helper functionality.
*
* Heavily borrowed from equivalent code in fs/nfs/io.c
*/
#include <linux/ceph/ceph_debug.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/rwsem.h>
#include <linux/fs.h>
#include "super.h"
#include "io.h"
/*
 * Take the inode out of direct I/O mode.
 *
 * If CEPH_I_ODIRECT is currently set, clear it under i_ceph_lock and then
 * call inode_dio_wait() on the inode. If the flag is already clear this
 * is a no-op.
 *
 * Call with exclusively locked inode->i_rwsem.
 */
static void ceph_block_o_direct(struct ceph_inode_info *ci, struct inode *inode)
{
	lockdep_assert_held_write(&inode->i_rwsem);

	/* Already in buffered mode: nothing to flip, nothing to wait for. */
	if (!(READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT))
		return;

	/* i_ceph_flags is modified under i_ceph_lock. */
	spin_lock(&ci->i_ceph_lock);
	ci->i_ceph_flags &= ~CEPH_I_ODIRECT;
	spin_unlock(&ci->i_ceph_lock);

	/* Only after the flag is cleared do we wait on direct I/O. */
	inode_dio_wait(inode);
}
/**
 * ceph_start_io_read - declare the file is being used for buffered reads
 * @inode: file inode
 *
 * Announce that a buffered read is about to begin and make sure the inode
 * is not in direct I/O mode.
 *
 * On return, CEPH_I_ODIRECT is clear and the caller holds inode->i_rwsem
 * shared. Buffered readers can therefore run concurrently with each other.
 * A direct I/O request that finds the flag clear must take the lock
 * exclusively to set it (see ceph_start_io_direct()), so it waits until
 * the readers have dropped their shared locks.
 *
 * Buffered writes take inode->i_rwsem exclusively (see
 * ceph_start_io_write()), so they are serialised w.r.t. these reads.
 *
 * Pair with ceph_end_io_read().
 */
void
ceph_start_io_read(struct inode *inode)
{
	struct ceph_inode_info *ci = ceph_inode(inode);

	/* Optimistic attempt: a shared lock is enough if we're in buffered mode. */
	down_read(&inode->i_rwsem);
	if (READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT) {
		/*
		 * Slow path: the inode is in direct I/O mode. The shared lock
		 * has to be dropped before the exclusive one can be taken;
		 * ceph_block_o_direct() re-checks the flag itself, so a change
		 * in that window is handled.
		 */
		up_read(&inode->i_rwsem);
		down_write(&inode->i_rwsem);
		ceph_block_o_direct(ci, inode);
		/* Return holding the lock shared, same as the fast path. */
		downgrade_write(&inode->i_rwsem);
	}
}
/**
 * ceph_end_io_read - declare that the buffered read operation is done
 * @inode: file inode
 *
 * Declare that a buffered read operation is done, and release the shared
 * lock on inode->i_rwsem.
 *
 * This is the counterpart of ceph_start_io_read(), which returns with
 * inode->i_rwsem held shared on both its fast and slow paths.
 */
void
ceph_end_io_read(struct inode *inode)
{
up_read(&inode->i_rwsem);
}
/**
 * ceph_start_io_write - declare the file is being used for buffered writes
 * @inode: file inode
 *
 * Announce that a buffered write is about to begin. Takes inode->i_rwsem
 * exclusively and, if the inode is in direct I/O mode, switches it back
 * to buffered mode via ceph_block_o_direct().
 *
 * On return the caller holds inode->i_rwsem exclusively; release it with
 * ceph_end_io_write().
 */
void
ceph_start_io_write(struct inode *inode)
{
	struct ceph_inode_info *ci = ceph_inode(inode);

	down_write(&inode->i_rwsem);
	ceph_block_o_direct(ci, inode);
}
/**
 * ceph_end_io_write - declare that the buffered write operation is done
 * @inode: file inode
 *
 * Declare that a buffered write operation is done, and release the
 * lock on inode->i_rwsem.
 *
 * This is the counterpart of ceph_start_io_write(), which returns with
 * inode->i_rwsem held exclusively.
 */
void
ceph_end_io_write(struct inode *inode)
{
up_write(&inode->i_rwsem);
}
/*
 * Put the inode into direct I/O mode.
 *
 * If CEPH_I_ODIRECT is not yet set, set it under i_ceph_lock and then
 * write back the inode's mapping with filemap_write_and_wait(). The
 * return value of that call is not checked here. If the flag is
 * already set this is a no-op.
 *
 * Call with exclusively locked inode->i_rwsem.
 */
static void ceph_block_buffered(struct ceph_inode_info *ci, struct inode *inode)
{
	lockdep_assert_held_write(&inode->i_rwsem);

	/* Already in direct I/O mode: nothing to flip, nothing to flush. */
	if (READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT)
		return;

	/* i_ceph_flags is modified under i_ceph_lock. */
	spin_lock(&ci->i_ceph_lock);
	ci->i_ceph_flags |= CEPH_I_ODIRECT;
	spin_unlock(&ci->i_ceph_lock);

	/* FIXME: unmap_mapping_range? */
	filemap_write_and_wait(inode->i_mapping);
}
/**
 * ceph_start_io_direct - declare the file is being used for direct i/o
 * @inode: file inode
 *
 * Declare that a direct I/O operation is about to start, and ensure
 * that we block all buffered I/O.
 * On exit, the function ensures that the CEPH_I_ODIRECT flag is set,
 * and holds a shared lock on inode->i_rwsem to ensure that the flag
 * cannot be changed.
 * In practice, this means that direct I/O operations are allowed to
 * execute in parallel, thanks to the shared lock, whereas buffered I/O
 * operations need to wait to grab an exclusive lock in order to clear
 * CEPH_I_ODIRECT.
 * Note that buffered writes take a write lock on inode->i_rwsem (see
 * ceph_start_io_write()), meaning that those are serialised w.r.t.
 * O_DIRECT. Truncates are expected to do the same, but that code is not
 * in this file.
 *
 * Pair with ceph_end_io_direct().
 */
void
ceph_start_io_direct(struct inode *inode)
{
	struct ceph_inode_info *ci = ceph_inode(inode);

	/* Be an optimist! */
	down_read(&inode->i_rwsem);
	if (READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT)
		return;
	up_read(&inode->i_rwsem);

	/*
	 * Slow path: the shared lock had to be dropped before taking the
	 * exclusive one. ceph_block_buffered() re-checks the flag itself,
	 * so a change in that window is handled.
	 */
	down_write(&inode->i_rwsem);
	ceph_block_buffered(ci, inode);
	/* Return holding the lock shared, same as the fast path. */
	downgrade_write(&inode->i_rwsem);
}
/**
 * ceph_end_io_direct - declare that the direct i/o operation is done
 * @inode: file inode
 *
 * Declare that a direct I/O operation is done, and release the shared
 * lock on inode->i_rwsem.
 *
 * This is the counterpart of ceph_start_io_direct(), which returns with
 * inode->i_rwsem held shared on both its fast and slow paths.
 */
void
ceph_end_io_direct(struct inode *inode)
{
up_read(&inode->i_rwsem);
}

12
fs/ceph/io.h Normal file
View File

@ -0,0 +1,12 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _FS_CEPH_IO_H
#define _FS_CEPH_IO_H

/*
 * Forward declaration: only pointers to struct inode are used below, so
 * this header does not depend on the includer having pulled in the full
 * definition first.
 */
struct inode;

/*
 * Buffered read: start returns with inode->i_rwsem held shared and
 * CEPH_I_ODIRECT clear; end releases the shared lock.
 */
void ceph_start_io_read(struct inode *inode);
void ceph_end_io_read(struct inode *inode);

/*
 * Buffered write: start returns with inode->i_rwsem held exclusively and
 * CEPH_I_ODIRECT clear; end releases the exclusive lock.
 */
void ceph_start_io_write(struct inode *inode);
void ceph_end_io_write(struct inode *inode);

/*
 * Direct I/O: start returns with inode->i_rwsem held shared and
 * CEPH_I_ODIRECT set; end releases the shared lock.
 */
void ceph_start_io_direct(struct inode *inode);
void ceph_end_io_direct(struct inode *inode);

#endif /* _FS_CEPH_IO_H */

View File

@ -516,7 +516,7 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
#define CEPH_I_FLUSH_SNAPS (1 << 9) /* need flush snapss */ #define CEPH_I_FLUSH_SNAPS (1 << 9) /* need flush snapss */
#define CEPH_I_ERROR_WRITE (1 << 10) /* have seen write errors */ #define CEPH_I_ERROR_WRITE (1 << 10) /* have seen write errors */
#define CEPH_I_ERROR_FILELOCK (1 << 11) /* have seen file lock errors */ #define CEPH_I_ERROR_FILELOCK (1 << 11) /* have seen file lock errors */
#define CEPH_I_ODIRECT (1 << 12) /* inode in direct I/O mode */
/* /*
* Masks of ceph inode work. * Masks of ceph inode work.