From 0e4e1de5b63fa423b13593337a27fd2d2b0bcf77 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 13 Mar 2020 11:20:51 +0100 Subject: [PATCH 1/5] rbd: avoid a deadlock on header_rwsem when flushing notifies rbd_unregister_watch() flushes notifies and therefore cannot be called under header_rwsem because a header update notify takes header_rwsem to synchronize with "rbd map". If mapping an image fails after the watch is established and a header update notify sneaks in, we deadlock when erroring out from rbd_dev_image_probe(). Move watch registration and unregistration out of the critical section. The only reason they were put there was to make header_rwsem management slightly more obvious. Fixes: 811c66887746 ("rbd: fix rbd map vs notify races") Signed-off-by: Ilya Dryomov Reviewed-by: Jason Dillaman --- drivers/block/rbd.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 1e0a6b19ae0d..ff2377e6d12c 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -4527,6 +4527,10 @@ static void cancel_tasks_sync(struct rbd_device *rbd_dev) cancel_work_sync(&rbd_dev->unlock_work); } +/* + * header_rwsem must not be held to avoid a deadlock with + * rbd_dev_refresh() when flushing notifies. + */ static void rbd_unregister_watch(struct rbd_device *rbd_dev) { cancel_tasks_sync(rbd_dev); @@ -6907,6 +6911,9 @@ static void rbd_dev_image_release(struct rbd_device *rbd_dev) * device. If this image is the one being mapped (i.e., not a * parent), initiate a watch on its header object before using that * object to get detailed information about the rbd image. + * + * On success, returns with header_rwsem held for write if called + * with @depth == 0. */ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth) { @@ -6936,6 +6943,9 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth) } } + if (!depth) + down_write(&rbd_dev->header_rwsem); + ret = rbd_dev_header_info(rbd_dev); if (ret) { if (ret == -ENOENT && !need_watch) @@ -6987,6 +6997,8 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth) err_out_probe: rbd_dev_unprobe(rbd_dev); err_out_watch: + if (!depth) + up_write(&rbd_dev->header_rwsem); if (need_watch) rbd_unregister_watch(rbd_dev); err_out_format: @@ -7050,12 +7062,9 @@ static ssize_t do_rbd_add(struct bus_type *bus, goto err_out_rbd_dev; } - down_write(&rbd_dev->header_rwsem); rc = rbd_dev_image_probe(rbd_dev, 0); - if (rc < 0) { - up_write(&rbd_dev->header_rwsem); + if (rc < 0) goto err_out_rbd_dev; - } if (rbd_dev->opts->alloc_size > rbd_dev->layout.object_size) { rbd_warn(rbd_dev, "alloc_size adjusted to %u", From 952c48b0ed18919bff7528501e9a3fff8a24f8cd Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 16 Mar 2020 15:52:54 +0100 Subject: [PATCH 2/5] rbd: call rbd_dev_unprobe() after unwatching and flushing notifies rbd_dev_unprobe() is supposed to undo most of rbd_dev_image_probe(), including rbd_dev_header_info(), which means that rbd_dev_header_info() isn't supposed to be called after rbd_dev_unprobe(). However, rbd_dev_image_release() calls rbd_dev_unprobe() before rbd_unregister_watch(). This is racy because a header update notify can sneak in: "rbd unmap" thread ceph-watch-notify worker rbd_dev_image_release() rbd_dev_unprobe() free and zero out header rbd_watch_cb() rbd_dev_refresh() rbd_dev_header_info() read in header The same goes for "rbd map" because rbd_dev_image_probe() calls rbd_dev_unprobe() on errors. In both cases this results in a memory leak. Fixes: fd22aef8b47c ("rbd: move rbd_unregister_watch() call into rbd_dev_image_release()") Signed-off-by: Ilya Dryomov Reviewed-by: Jason Dillaman --- drivers/block/rbd.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index ff2377e6d12c..7aec8bc5df6e 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -6898,9 +6898,10 @@ static void rbd_print_dne(struct rbd_device *rbd_dev, bool is_snap) static void rbd_dev_image_release(struct rbd_device *rbd_dev) { - rbd_dev_unprobe(rbd_dev); if (rbd_dev->opts) rbd_unregister_watch(rbd_dev); + + rbd_dev_unprobe(rbd_dev); rbd_dev->image_format = 0; kfree(rbd_dev->spec->image_id); rbd_dev->spec->image_id = NULL; @@ -6950,7 +6951,7 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth) if (ret) { if (ret == -ENOENT && !need_watch) rbd_print_dne(rbd_dev, false); - goto err_out_watch; + goto err_out_probe; } /* @@ -6995,12 +6996,11 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth) return 0; err_out_probe: - rbd_dev_unprobe(rbd_dev); -err_out_watch: if (!depth) up_write(&rbd_dev->header_rwsem); if (need_watch) rbd_unregister_watch(rbd_dev); + rbd_dev_unprobe(rbd_dev); err_out_format: rbd_dev->image_format = 0; kfree(rbd_dev->spec->image_id); From b8776051529230f76e464d5ffc5d1cf8465576bf Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 16 Mar 2020 17:16:28 +0100 Subject: [PATCH 3/5] rbd: don't test rbd_dev->opts in rbd_dev_image_release() rbd_dev->opts is used to distinguish between the image that is being mapped and a parent. However, because we no longer establish watch for read-only mappings, this test is imprecise and results in unnecessary rbd_unregister_watch() calls. Make it consistent with need_watch in rbd_dev_image_probe(). Fixes: b9ef2b8858a0 ("rbd: don't establish watch for read-only mappings") Signed-off-by: Ilya Dryomov Reviewed-by: Jason Dillaman --- drivers/block/rbd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 7aec8bc5df6e..205192a5ec8f 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -6898,7 +6898,7 @@ static void rbd_print_dne(struct rbd_device *rbd_dev, bool is_snap) static void rbd_dev_image_release(struct rbd_device *rbd_dev) { - if (rbd_dev->opts) + if (!rbd_is_ro(rbd_dev)) rbd_unregister_watch(rbd_dev); rbd_dev_unprobe(rbd_dev); From 8ae0299a4b72f2f9ad2b755da91c6a2beabaee62 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 17 Mar 2020 15:18:48 +0100 Subject: [PATCH 4/5] rbd: don't mess with a page vector in rbd_notify_op_lock() rbd_notify_op_lock() isn't interested in a notify reply. Instead of accepting that page vector just to free it, have watch-notify code take care of it. Signed-off-by: Ilya Dryomov Reviewed-by: Jason Dillaman --- drivers/block/rbd.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 205192a5ec8f..67d65ac785e9 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -3754,11 +3754,7 @@ static int __rbd_notify_op_lock(struct rbd_device *rbd_dev, static void rbd_notify_op_lock(struct rbd_device *rbd_dev, enum rbd_notify_op notify_op) { - struct page **reply_pages; - size_t reply_len; - - __rbd_notify_op_lock(rbd_dev, notify_op, &reply_pages, &reply_len); - ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len)); + __rbd_notify_op_lock(rbd_dev, notify_op, NULL, NULL); } static void rbd_notify_acquired_lock(struct work_struct *work) From 2a575f138d003fff0f4930b5cfae4a1c46343b8f Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 8 Apr 2020 08:41:38 -0400 Subject: [PATCH 5/5] ceph: fix potential bad pointer deref in async dirops cb's The new async dirops callback routines can pass ERR_PTR values to ceph_mdsc_free_path, which could cause an oops. Make ceph_mdsc_free_path ignore ERR_PTR values. Also, ensure that the pr_warn messages look sane even if ceph_mdsc_build_path fails. Reported-by: Dan Carpenter Signed-off-by: Jeff Layton Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- fs/ceph/dir.c | 4 ++-- fs/ceph/file.c | 4 ++-- fs/ceph/mds_client.h | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index d594c2627430..4c4202c93b71 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -1051,8 +1051,8 @@ static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc, /* If op failed, mark everyone involved for errors */ if (result) { - int pathlen; - u64 base; + int pathlen = 0; + u64 base = 0; char *path = ceph_mdsc_build_path(req->r_dentry, &pathlen, &base, 0); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 4a5ccbb7e808..afdfca965a7f 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -527,8 +527,8 @@ static void ceph_async_create_cb(struct ceph_mds_client *mdsc, if (result) { struct dentry *dentry = req->r_dentry; - int pathlen; - u64 base; + int pathlen = 0; + u64 base = 0; char *path = ceph_mdsc_build_path(req->r_dentry, &pathlen, &base, 0); diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 4e5be79bf080..903d9edfd4bf 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -521,7 +521,7 @@ extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc); static inline void ceph_mdsc_free_path(char *path, int len) { - if (path) + if (!IS_ERR_OR_NULL(path)) __putname(path - (PATH_MAX - 1 - len)); }