From a81bc3102b4ffb885f34855d0133f862f915ab13 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 13 Nov 2019 09:10:27 -0500 Subject: [PATCH 1/3] ceph: take the inode lock before acquiring cap refs Most of the time, we (or the vfs layer) takes the inode_lock and then acquires caps, but ceph_read_iter does the opposite, and that can lead to a deadlock. When there are multiple clients treading over the same data, we can end up in a situation where a reader takes caps and then tries to acquire the inode_lock. Another task holds the inode_lock and issues a request to the MDS which needs to revoke the caps, but that can't happen until the inode_lock is unwedged. Fix this by having ceph_read_iter take the inode_lock earlier, before attempting to acquire caps. Fixes: 321fe13c9398 ("ceph: add buffered/direct exclusionary locking for reads and writes") Link: https://tracker.ceph.com/issues/36348 Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/file.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index bd77adb64bfd..06efeaff3b57 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1264,14 +1264,24 @@ again: dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n", inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, inode); + if (iocb->ki_flags & IOCB_DIRECT) + ceph_start_io_direct(inode); + else + ceph_start_io_read(inode); + if (fi->fmode & CEPH_FILE_MODE_LAZY) want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; else want = CEPH_CAP_FILE_CACHE; ret = ceph_get_caps(filp, CEPH_CAP_FILE_RD, want, -1, &got, &pinned_page); - if (ret < 0) + if (ret < 0) { + if (iocb->ki_flags & IOCB_DIRECT) + ceph_end_io_direct(inode); + else + ceph_end_io_read(inode); return ret; + } if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 || (iocb->ki_flags & IOCB_DIRECT) || @@ -1283,16 +1293,12 @@ again: if (ci->i_inline_version == CEPH_INLINE_NONE) { if (!retry_op && (iocb->ki_flags & IOCB_DIRECT)) { - ceph_start_io_direct(inode); ret = ceph_direct_read_write(iocb, to, NULL, NULL); - ceph_end_io_direct(inode); if (ret >= 0 && ret < len) retry_op = CHECK_EOF; } else { - ceph_start_io_read(inode); ret = ceph_sync_read(iocb, to, &retry_op); - ceph_end_io_read(inode); } } else { retry_op = READ_INLINE; @@ -1303,11 +1309,10 @@ again: inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, ceph_cap_string(got)); ceph_add_rw_context(fi, &rw_ctx); - ceph_start_io_read(inode); ret = generic_file_read_iter(iocb, to); - ceph_end_io_read(inode); ceph_del_rw_context(fi, &rw_ctx); } + dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n", inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret); if (pinned_page) { @@ -1315,6 +1320,12 @@ again: pinned_page = NULL; } ceph_put_cap_refs(ci, got); + + if (iocb->ki_flags & IOCB_DIRECT) + ceph_end_io_direct(inode); + else + ceph_end_io_read(inode); + if (retry_op > HAVE_RETRIED && ret >= 0) { int statret; struct page *page = NULL; From 6a81749ebe5f1b52d7eeb8a1031deb8d520f23e6 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 13 Nov 2019 09:56:06 -0500 Subject: [PATCH 2/3] ceph: increment/decrement dio counter on async requests Ceph can in some cases issue an async DIO request, in which case we can end up calling ceph_end_io_direct before the I/O is actually complete. That may allow buffered operations to proceed while DIO requests are still in flight. Fix this by incrementing the i_dio_count when issuing an async DIO request, and decrement it when tearing down the aio_req. Fixes: 321fe13c9398 ("ceph: add buffered/direct exclusionary locking for reads and writes") Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/file.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 06efeaff3b57..8de633964dc3 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -753,6 +753,9 @@ static void ceph_aio_complete(struct inode *inode, if (!atomic_dec_and_test(&aio_req->pending_reqs)) return; + if (aio_req->iocb->ki_flags & IOCB_DIRECT) + inode_dio_end(inode); + ret = aio_req->error; if (!ret) ret = aio_req->total_len; @@ -1091,6 +1094,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, CEPH_CAP_FILE_RD); list_splice(&aio_req->osd_reqs, &osd_reqs); + inode_dio_begin(inode); while (!list_empty(&osd_reqs)) { req = list_first_entry(&osd_reqs, struct ceph_osd_request, From 633739b2fedb6617d782ca252797b7a8ad754347 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 13 Nov 2019 12:07:15 +0100 Subject: [PATCH 3/3] rbd: silence bogus uninitialized warning in rbd_object_map_update_finish() Some versions of gcc (so far 6.3 and 7.4) throw a warning: drivers/block/rbd.c: In function 'rbd_object_map_callback': drivers/block/rbd.c:2124:21: warning: 'current_state' may be used uninitialized in this function [-Wmaybe-uninitialized] (current_state == OBJECT_EXISTS && state == OBJECT_EXISTS_CLEAN)) drivers/block/rbd.c:2092:23: note: 'current_state' was declared here u8 state, new_state, current_state; ^~~~~~~~~~~~~ It's bogus because all current_state accesses are guarded by has_current_state. Reported-by: kbuild test robot Signed-off-by: Ilya Dryomov Reviewed-by: Dongsheng Yang --- drivers/block/rbd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 39136675dae5..13527a0b4e44 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2087,7 +2087,7 @@ static int rbd_object_map_update_finish(struct rbd_obj_request *obj_req, struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; struct ceph_osd_data *osd_data; u64 objno; - u8 state, new_state, current_state; + u8 state, new_state, uninitialized_var(current_state); bool has_current_state; void *p;