From 69cea0d45a618ad4ae74f36386ef1af5128b2b19 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Wed, 10 Jun 2020 15:02:37 -0400 Subject: [PATCH 01/20] dm mpath: changes from initial m->flags locking audit Fix locking in slow-paths where m->lock should be taken. Signed-off-by: Mike Snitzer --- drivers/md/dm-mpath.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 78cff42d987e..d7bb74bded8c 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -335,6 +335,8 @@ static int pg_init_all_paths(struct multipath *m) static void __switch_pg(struct multipath *m, struct priority_group *pg) { + lockdep_assert_held(&m->lock); + m->current_pg = pg; /* Must we initialise the PG first, and queue I/O till it's ready? */ @@ -382,7 +384,9 @@ static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes) unsigned bypassed = 1; if (!atomic_read(&m->nr_valid_paths)) { + spin_lock_irqsave(&m->lock, flags); clear_bit(MPATHF_QUEUE_IO, &m->flags); + spin_unlock_irqrestore(&m->lock, flags); goto failed; } @@ -422,8 +426,11 @@ check_current_pg: continue; pgpath = choose_path_in_pg(m, pg, nr_bytes); if (!IS_ERR_OR_NULL(pgpath)) { - if (!bypassed) + if (!bypassed) { + spin_lock_irqsave(&m->lock, flags); set_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags); + spin_unlock_irqrestore(&m->lock, flags); + } return pgpath; } } @@ -1662,9 +1669,9 @@ static int multipath_end_io_bio(struct dm_target *ti, struct bio *clone, spin_lock_irqsave(&m->lock, flags); bio_list_add(&m->queued_bios, clone); - spin_unlock_irqrestore(&m->lock, flags); if (!test_bit(MPATHF_QUEUE_IO, &m->flags)) queue_work(kmultipathd, &m->process_queued_bios); + spin_unlock_irqrestore(&m->lock, flags); r = DM_ENDIO_INCOMPLETE; done: @@ -1938,6 +1945,7 @@ static int multipath_prepare_ioctl(struct dm_target *ti, { struct multipath *m = ti->private; struct pgpath *current_pgpath; + unsigned long flags; int r; current_pgpath = READ_ONCE(m->current_pgpath); @@ -1965,8 +1973,10 @@ static int multipath_prepare_ioctl(struct dm_target *ti, /* Path status changed, redo selection */ (void) choose_pgpath(m, 0); } + spin_lock_irqsave(&m->lock, flags); if (test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) - pg_init_all_paths(m); + (void) __pg_init_all_paths(m); + spin_unlock_irqrestore(&m->lock, flags); dm_table_run_md_queue_async(m->ti->table); process_queued_io_list(m); } From a271a89c6e4773478b1c4f8213dfe8351ea66723 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Wed, 10 Jun 2020 16:07:57 -0400 Subject: [PATCH 02/20] dm mpath: take m->lock spinlock when testing QUEUE_IF_NO_PATH Fix multipath_end_io, multipath_end_io_bio and multipath_busy to take m->lock while testing if MPATHF_QUEUE_IF_NO_PATH bit is set. These are all slow-path cases when no paths are available so extra locking isn't a performance hit. Correctness matters most. 
Signed-off-by: Mike Snitzer --- drivers/md/dm-mpath.c | 52 ++++++++++++++++++++++++++++--------------- 1 file changed, 34 insertions(+), 18 deletions(-) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index d7bb74bded8c..bf5175805b1d 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -1621,12 +1621,16 @@ static int multipath_end_io(struct dm_target *ti, struct request *clone, if (pgpath) fail_path(pgpath); - if (atomic_read(&m->nr_valid_paths) == 0 && - !must_push_back_rq(m)) { - if (error == BLK_STS_IOERR) - dm_report_EIO(m); - /* complete with the original error */ - r = DM_ENDIO_DONE; + if (!atomic_read(&m->nr_valid_paths)) { + unsigned long flags; + spin_lock_irqsave(&m->lock, flags); + if (!must_push_back_rq(m)) { + if (error == BLK_STS_IOERR) + dm_report_EIO(m); + /* complete with the original error */ + r = DM_ENDIO_DONE; + } + spin_unlock_irqrestore(&m->lock, flags); } } @@ -1656,15 +1660,19 @@ static int multipath_end_io_bio(struct dm_target *ti, struct bio *clone, if (pgpath) fail_path(pgpath); - if (atomic_read(&m->nr_valid_paths) == 0 && - !test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) { - if (__must_push_back(m)) { - r = DM_ENDIO_REQUEUE; - } else { - dm_report_EIO(m); - *error = BLK_STS_IOERR; + if (!atomic_read(&m->nr_valid_paths)) { + spin_lock_irqsave(&m->lock, flags); + if (!test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) { + if (__must_push_back(m)) { + r = DM_ENDIO_REQUEUE; + } else { + dm_report_EIO(m); + *error = BLK_STS_IOERR; + } + spin_unlock_irqrestore(&m->lock, flags); + goto done; } - goto done; + spin_unlock_irqrestore(&m->lock, flags); } spin_lock_irqsave(&m->lock, flags); @@ -1962,10 +1970,11 @@ static int multipath_prepare_ioctl(struct dm_target *ti, } } else { /* No path is available */ + r = -EIO; + spin_lock_irqsave(&m->lock, flags); if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) r = -ENOTCONN; - else - r = -EIO; + spin_unlock_irqrestore(&m->lock, flags); } if (r == -ENOTCONN) { @@ -2036,8 +2045,15 @@ static int multipath_busy(struct dm_target *ti) return true; /* no paths available, for blk-mq: rely on IO mapping to delay requeue */ - if (!atomic_read(&m->nr_valid_paths) && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) - return (m->queue_mode != DM_TYPE_REQUEST_BASED); + if (!atomic_read(&m->nr_valid_paths)) { + unsigned long flags; + spin_lock_irqsave(&m->lock, flags); + if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) { + spin_unlock_irqrestore(&m->lock, flags); + return (m->queue_mode != DM_TYPE_REQUEST_BASED); + } + spin_unlock_irqrestore(&m->lock, flags); + } /* Guess which priority_group will be used at next mapping time */ pg = READ_ONCE(m->current_pg); From 73265f3ffdc90f0e27b79fbcba8530c75c74ef12 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Wed, 10 Jun 2020 16:51:32 -0400 Subject: [PATCH 03/20] dm mpath: push locking down to must_push_back_rq() Signed-off-by: Mike Snitzer --- drivers/md/dm-mpath.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index bf5175805b1d..87b6a2c98c73 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -472,7 +472,14 @@ static bool __must_push_back(struct multipath *m) static bool must_push_back_rq(struct multipath *m) { - return test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) || __must_push_back(m); + unsigned long flags; + bool ret; + + spin_lock_irqsave(&m->lock, flags); + ret = (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) || __must_push_back(m)); + spin_unlock_irqrestore(&m->lock, 
flags); + + return ret; } /* @@ -1621,16 +1628,12 @@ static int multipath_end_io(struct dm_target *ti, struct request *clone, if (pgpath) fail_path(pgpath); - if (!atomic_read(&m->nr_valid_paths)) { - unsigned long flags; - spin_lock_irqsave(&m->lock, flags); - if (!must_push_back_rq(m)) { - if (error == BLK_STS_IOERR) - dm_report_EIO(m); - /* complete with the original error */ - r = DM_ENDIO_DONE; - } - spin_unlock_irqrestore(&m->lock, flags); + if (!atomic_read(&m->nr_valid_paths) && + !must_push_back_rq(m)) { + if (error == BLK_STS_IOERR) + dm_report_EIO(m); + /* complete with the original error */ + r = DM_ENDIO_DONE; } } From f45f11868e0efeccfc1c695e56864f9032f18bdc Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Wed, 10 Jun 2020 20:03:19 -0400 Subject: [PATCH 04/20] dm mpath: factor out multipath_queue_bio Enables further cleanup. Signed-off-by: Mike Snitzer --- drivers/md/dm-mpath.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 87b6a2c98c73..4b48183c6b5c 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -574,6 +574,18 @@ static void multipath_release_clone(struct request *clone, * Map cloned bios (bio-based multipath) */ +static void multipath_queue_bio(struct multipath *m, struct bio *bio) +{ + unsigned long flags; + + /* Queue for the daemon to resubmit */ + spin_lock_irqsave(&m->lock, flags); + bio_list_add(&m->queued_bios, bio); + if (!test_bit(MPATHF_QUEUE_IO, &m->flags)) + queue_work(kmultipathd, &m->process_queued_bios); + spin_unlock_irqrestore(&m->lock, flags); +} + static struct pgpath *__map_bio(struct multipath *m, struct bio *bio) { struct pgpath *pgpath; @@ -590,16 +602,11 @@ static struct pgpath *__map_bio(struct multipath *m, struct bio *bio) if ((pgpath && queue_io) || (!pgpath && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))) { - /* Queue for the daemon to resubmit */ - spin_lock_irqsave(&m->lock, flags); - bio_list_add(&m->queued_bios, bio); - spin_unlock_irqrestore(&m->lock, flags); + multipath_queue_bio(m, bio); /* PG_INIT_REQUIRED cannot be set without QUEUE_IO */ if (queue_io || test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) pg_init_all_paths(m); - else if (!queue_io) - queue_work(kmultipathd, &m->process_queued_bios); return ERR_PTR(-EAGAIN); } @@ -1678,12 +1685,7 @@ static int multipath_end_io_bio(struct dm_target *ti, struct bio *clone, spin_unlock_irqrestore(&m->lock, flags); } - spin_lock_irqsave(&m->lock, flags); - bio_list_add(&m->queued_bios, clone); - if (!test_bit(MPATHF_QUEUE_IO, &m->flags)) - queue_work(kmultipathd, &m->process_queued_bios); - spin_unlock_irqrestore(&m->lock, flags); - + multipath_queue_bio(m, clone); r = DM_ENDIO_INCOMPLETE; done: if (pgpath) { From 17213ec1806199ab528655946af144abc37d89fb Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Wed, 10 Jun 2020 20:19:56 -0400 Subject: [PATCH 05/20] dm mpath: rework __map_bio() so that it follows same pattern as request-based multipath_clone_and_map() Signed-off-by: Mike Snitzer --- drivers/md/dm-mpath.c | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 4b48183c6b5c..ab6ccd619573 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -574,15 +574,20 @@ static void multipath_release_clone(struct request *clone, * Map cloned bios (bio-based multipath) */ +static void __multipath_queue_bio(struct multipath *m, struct bio *bio) +{ + /* Queue for the daemon to 
resubmit */ + bio_list_add(&m->queued_bios, bio); + if (!test_bit(MPATHF_QUEUE_IO, &m->flags)) + queue_work(kmultipathd, &m->process_queued_bios); +} + static void multipath_queue_bio(struct multipath *m, struct bio *bio) { unsigned long flags; - /* Queue for the daemon to resubmit */ spin_lock_irqsave(&m->lock, flags); - bio_list_add(&m->queued_bios, bio); - if (!test_bit(MPATHF_QUEUE_IO, &m->flags)) - queue_work(kmultipathd, &m->process_queued_bios); + __multipath_queue_bio(m, bio); spin_unlock_irqrestore(&m->lock, flags); } @@ -590,24 +595,24 @@ static struct pgpath *__map_bio(struct multipath *m, struct bio *bio) { struct pgpath *pgpath; unsigned long flags; - bool queue_io; /* Do we need to select a new pgpath? */ pgpath = READ_ONCE(m->current_pgpath); if (!pgpath || !test_bit(MPATHF_QUEUE_IO, &m->flags)) pgpath = choose_pgpath(m, bio->bi_iter.bi_size); - /* MPATHF_QUEUE_IO might have been cleared by choose_pgpath. */ - queue_io = test_bit(MPATHF_QUEUE_IO, &m->flags); + if (!pgpath) { + spin_lock_irqsave(&m->lock, flags); + if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) { + __multipath_queue_bio(m, bio); + pgpath = ERR_PTR(-EAGAIN); + } + spin_unlock_irqrestore(&m->lock, flags); - if ((pgpath && queue_io) || - (!pgpath && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))) { + } else if (test_bit(MPATHF_QUEUE_IO, &m->flags) || + test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) { multipath_queue_bio(m, bio); - - /* PG_INIT_REQUIRED cannot be set without QUEUE_IO */ - if (queue_io || test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) - pg_init_all_paths(m); - + pg_init_all_paths(m); return ERR_PTR(-EAGAIN); } From 564dbb130b3f8ab39719a42c130505bf75610fbf Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Wed, 10 Jun 2020 21:25:39 -0400 Subject: [PATCH 06/20] dm mpath: rename current_pgpath to pgpath in multipath_prepare_ioctl Makes consistent with __map_bio() and multipath_clone_and_map(). Signed-off-by: Mike Snitzer --- drivers/md/dm-mpath.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index ab6ccd619573..f71bb4e5eaf7 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -1962,17 +1962,17 @@ static int multipath_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) { struct multipath *m = ti->private; - struct pgpath *current_pgpath; + struct pgpath *pgpath; unsigned long flags; int r; - current_pgpath = READ_ONCE(m->current_pgpath); - if (!current_pgpath || !test_bit(MPATHF_QUEUE_IO, &m->flags)) - current_pgpath = choose_pgpath(m, 0); + pgpath = READ_ONCE(m->current_pgpath); + if (!pgpath || !test_bit(MPATHF_QUEUE_IO, &m->flags)) + pgpath = choose_pgpath(m, 0); - if (current_pgpath) { + if (pgpath) { if (!test_bit(MPATHF_QUEUE_IO, &m->flags)) { - *bdev = current_pgpath->path.dev->bdev; + *bdev = pgpath->path.dev->bdev; r = 0; } else { /* pg_init has not started or completed */ From 374117ad4736c5a4f8012cfe59fc07d9d58191d5 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Mon, 15 Jun 2020 11:31:04 -0400 Subject: [PATCH 07/20] dm mpath: use double checked locking in fast path Fast-path code biased toward lazy acknowledgement of bit being set (primarily only for initialization). Multipath code is very retry oriented so even if state is missed it'll recover. 
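The pattern this patch introduces (mpath_double_check_test_bit() in the diff below) can be illustrated with a minimal user-space sketch; it is not part of the patch, the pthread mutex merely stands in for m->lock and all names are illustrative:

  #include <pthread.h>
  #include <stdbool.h>
  #include <stdio.h>

  #define MPATHF_QUEUE_IO 0

  static unsigned long flags_word;
  static pthread_mutex_t m_lock = PTHREAD_MUTEX_INITIALIZER;

  static bool test_bit(int bit, const unsigned long *word)
  {
          return (*word >> bit) & 1UL;
  }

  /* Unlocked read as a hint; only a positive result is re-checked under
   * the lock before being acted on.  A missed transition is tolerated
   * because the multipath code retries. */
  static bool double_check_test_bit(int bit)
  {
          bool r = test_bit(bit, &flags_word);

          if (r) {
                  pthread_mutex_lock(&m_lock);
                  r = test_bit(bit, &flags_word);
                  pthread_mutex_unlock(&m_lock);
          }
          return r;
  }

  int main(void)
  {
          flags_word |= 1UL << MPATHF_QUEUE_IO;
          printf("QUEUE_IO set: %d\n", double_check_test_bit(MPATHF_QUEUE_IO));
          return 0;
  }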
Signed-off-by: Mike Snitzer --- drivers/md/dm-mpath.c | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index f71bb4e5eaf7..88dc7b41b1c6 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -128,6 +128,20 @@ static void queue_if_no_path_timeout_work(struct timer_list *t); #define MPATHF_PG_INIT_REQUIRED 5 /* pg_init needs calling? */ #define MPATHF_PG_INIT_DELAY_RETRY 6 /* Delay pg_init retry? */ +static bool mpath_double_check_test_bit(int MPATHF_bit, struct multipath *m) +{ + bool r = test_bit(MPATHF_bit, &m->flags); + + if (r) { + unsigned long flags; + spin_lock_irqsave(&m->lock, flags); + r = test_bit(MPATHF_bit, &m->flags); + spin_unlock_irqrestore(&m->lock, flags); + } + + return r; +} + /*----------------------------------------------- * Allocation routines *-----------------------------------------------*/ @@ -499,7 +513,7 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq, /* Do we need to select a new pgpath? */ pgpath = READ_ONCE(m->current_pgpath); - if (!pgpath || !test_bit(MPATHF_QUEUE_IO, &m->flags)) + if (!pgpath || !mpath_double_check_test_bit(MPATHF_QUEUE_IO, m)) pgpath = choose_pgpath(m, nr_bytes); if (!pgpath) { @@ -507,8 +521,8 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq, return DM_MAPIO_DELAY_REQUEUE; dm_report_EIO(m); /* Failed */ return DM_MAPIO_KILL; - } else if (test_bit(MPATHF_QUEUE_IO, &m->flags) || - test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) { + } else if (mpath_double_check_test_bit(MPATHF_QUEUE_IO, m) || + mpath_double_check_test_bit(MPATHF_PG_INIT_REQUIRED, m)) { pg_init_all_paths(m); return DM_MAPIO_DELAY_REQUEUE; } @@ -598,7 +612,7 @@ static struct pgpath *__map_bio(struct multipath *m, struct bio *bio) /* Do we need to select a new pgpath? 
*/ pgpath = READ_ONCE(m->current_pgpath); - if (!pgpath || !test_bit(MPATHF_QUEUE_IO, &m->flags)) + if (!pgpath || !mpath_double_check_test_bit(MPATHF_QUEUE_IO, m)) pgpath = choose_pgpath(m, bio->bi_iter.bi_size); if (!pgpath) { @@ -609,8 +623,8 @@ static struct pgpath *__map_bio(struct multipath *m, struct bio *bio) } spin_unlock_irqrestore(&m->lock, flags); - } else if (test_bit(MPATHF_QUEUE_IO, &m->flags) || - test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) { + } else if (mpath_double_check_test_bit(MPATHF_QUEUE_IO, m) || + mpath_double_check_test_bit(MPATHF_PG_INIT_REQUIRED, m)) { multipath_queue_bio(m, bio); pg_init_all_paths(m); return ERR_PTR(-EAGAIN); @@ -861,7 +875,7 @@ static int setup_scsi_dh(struct block_device *bdev, struct multipath *m, struct request_queue *q = bdev_get_queue(bdev); int r; - if (test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags)) { + if (mpath_double_check_test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, m)) { retain: if (*attached_handler_name) { /* @@ -1967,11 +1981,11 @@ static int multipath_prepare_ioctl(struct dm_target *ti, int r; pgpath = READ_ONCE(m->current_pgpath); - if (!pgpath || !test_bit(MPATHF_QUEUE_IO, &m->flags)) + if (!pgpath || !mpath_double_check_test_bit(MPATHF_QUEUE_IO, m)) pgpath = choose_pgpath(m, 0); if (pgpath) { - if (!test_bit(MPATHF_QUEUE_IO, &m->flags)) { + if (!mpath_double_check_test_bit(MPATHF_QUEUE_IO, m)) { *bdev = pgpath->path.dev->bdev; r = 0; } else { From e1fef0b08e06dcce275bd585bc6a900dff395096 Mon Sep 17 00:00:00 2001 From: JeongHyeon Lee Date: Thu, 18 Jun 2020 15:56:50 +0900 Subject: [PATCH 08/20] dm verity: add "panic_on_corruption" error handling mode Samsung smart phones may need the ability to panic on corruption. Not all devices provide the bootloader support needed to use the existing "restart_on_corruption" mode. Additional details for why Samsung needs this new mode can be found here: https://www.redhat.com/archives/dm-devel/2020-June/msg00235.html Signed-off-by: jhs2.lee Signed-off-by: Mike Snitzer --- Documentation/admin-guide/device-mapper/verity.rst | 4 ++++ drivers/md/dm-verity-target.c | 13 ++++++++++++- drivers/md/dm-verity.h | 3 ++- 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/device-mapper/verity.rst b/Documentation/admin-guide/device-mapper/verity.rst index bb02caa45289..66f71f0dab1b 100644 --- a/Documentation/admin-guide/device-mapper/verity.rst +++ b/Documentation/admin-guide/device-mapper/verity.rst @@ -83,6 +83,10 @@ restart_on_corruption not compatible with ignore_corruption and requires user space support to avoid restart loops. +panic_on_corruption + Panic the device when a corrupted block is discovered. This option is + not compatible with ignore_corruption and restart_on_corruption. + ignore_zero_blocks Do not verify blocks that are expected to contain zeroes and always return zeroes instead. 
This may be useful if the partition contains unused blocks diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index eec9f252e935..ddcd1d03beb9 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -30,6 +30,7 @@ #define DM_VERITY_OPT_LOGGING "ignore_corruption" #define DM_VERITY_OPT_RESTART "restart_on_corruption" +#define DM_VERITY_OPT_PANIC "panic_on_corruption" #define DM_VERITY_OPT_IGN_ZEROES "ignore_zero_blocks" #define DM_VERITY_OPT_AT_MOST_ONCE "check_at_most_once" @@ -254,6 +255,9 @@ out: if (v->mode == DM_VERITY_MODE_RESTART) kernel_restart("dm-verity device corrupted"); + if (v->mode == DM_VERITY_MODE_PANIC) + panic("dm-verity device corrupted"); + return 1; } @@ -742,6 +746,9 @@ static void verity_status(struct dm_target *ti, status_type_t type, case DM_VERITY_MODE_RESTART: DMEMIT(DM_VERITY_OPT_RESTART); break; + case DM_VERITY_MODE_PANIC: + DMEMIT(DM_VERITY_OPT_PANIC); + break; default: BUG(); } @@ -907,6 +914,10 @@ static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v, v->mode = DM_VERITY_MODE_RESTART; continue; + } else if (!strcasecmp(arg_name, DM_VERITY_OPT_PANIC)) { + v->mode = DM_VERITY_MODE_PANIC; + continue; + } else if (!strcasecmp(arg_name, DM_VERITY_OPT_IGN_ZEROES)) { r = verity_alloc_zero_digest(v); if (r) { @@ -1221,7 +1232,7 @@ bad: static struct target_type verity_target = { .name = "verity", - .version = {1, 6, 0}, + .version = {1, 7, 0}, .module = THIS_MODULE, .ctr = verity_ctr, .dtr = verity_dtr, diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h index 641b9e3a399b..4e769d13473a 100644 --- a/drivers/md/dm-verity.h +++ b/drivers/md/dm-verity.h @@ -20,7 +20,8 @@ enum verity_mode { DM_VERITY_MODE_EIO, DM_VERITY_MODE_LOGGING, - DM_VERITY_MODE_RESTART + DM_VERITY_MODE_RESTART, + DM_VERITY_MODE_PANIC }; enum verity_block_type { From 4f7f590b152444c1403ece9eeeddd1e8b22ba04e Mon Sep 17 00:00:00 2001 From: yangerkun Date: Fri, 19 Jun 2020 17:10:39 -0400 Subject: [PATCH 09/20] dm dust: report some message results directly back to user Some messages (queryblock, countbadblocks, removebadblock) are best reported directly to user directly. Do so with DMEMIT. 
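DMEMIT appends formatted text at a running offset into the bounded result buffer that the message interface hands to the target. A rough user-space analogue of that accumulation (illustrative names only, not kernel code):

  #include <stdarg.h>
  #include <stdio.h>

  /* Append to result[] at offset *sz without writing past maxlen. */
  static void emit(char *result, unsigned int maxlen, unsigned int *sz,
                   const char *fmt, ...)
  {
          va_list ap;
          int n;

          if (*sz >= maxlen)
                  return;
          va_start(ap, fmt);
          n = vsnprintf(result + *sz, maxlen - *sz, fmt, ap);
          va_end(ap);
          if (n > 0)
                  *sz += n;       /* further output is dropped once full */
  }

  int main(void)
  {
          char result[64];
          unsigned int sz = 0;

          emit(result, sizeof(result), &sz,
               "%s: block %llu found in badblocklist",
               "dust_query_block", 72ULL);
          puts(result);
          return 0;
  }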
[Bryan: maintain __func__ output in DMEMIT messages] Signed-off-by: yangerkun Signed-off-by: Bryan Gurney Signed-off-by: Mike Snitzer --- .../admin-guide/device-mapper/dm-dust.rst | 15 ++++----- drivers/md/dm-dust.c | 31 +++++++++++-------- 2 files changed, 26 insertions(+), 20 deletions(-) diff --git a/Documentation/admin-guide/device-mapper/dm-dust.rst b/Documentation/admin-guide/device-mapper/dm-dust.rst index b6e7e7ead831..cf8079e368de 100644 --- a/Documentation/admin-guide/device-mapper/dm-dust.rst +++ b/Documentation/admin-guide/device-mapper/dm-dust.rst @@ -69,10 +69,11 @@ Create the dm-dust device: $ sudo dmsetup create dust1 --table '0 33552384 dust /dev/vdb1 0 4096' Check the status of the read behavior ("bypass" indicates that all I/O -will be passed through to the underlying device):: +will be passed through to the underlying device; "verbose" indicates that +bad block additions, removals, and remaps will be verbosely logged):: $ sudo dmsetup status dust1 - 0 33552384 dust 252:17 bypass + 0 33552384 dust 252:17 bypass verbose $ sudo dd if=/dev/mapper/dust1 of=/dev/null bs=512 count=128 iflag=direct 128+0 records in @@ -164,7 +165,7 @@ following message command:: A message will print with the number of bad blocks currently configured on the device:: - kernel: device-mapper: dust: countbadblocks: 895 badblock(s) found + countbadblocks: 895 badblock(s) found Querying for specific bad blocks -------------------------------- @@ -176,11 +177,11 @@ following message command:: The following message will print if the block is in the list:: - device-mapper: dust: queryblock: block 72 found in badblocklist + dust_query_block: block 72 found in badblocklist The following message will print if the block is not in the list:: - device-mapper: dust: queryblock: block 72 not found in badblocklist + dust_query_block: block 72 not found in badblocklist The "queryblock" message command will work in both the "enabled" and "disabled" modes, allowing the verification of whether a block @@ -198,12 +199,12 @@ following message command:: After clearing the bad block list, the following message will appear:: - kernel: device-mapper: dust: clearbadblocks: badblocks cleared + dust_clear_badblocks: badblocks cleared If there were no bad blocks to clear, the following message will appear:: - kernel: device-mapper: dust: clearbadblocks: no badblocks found + dust_clear_badblocks: no badblocks found Message commands list --------------------- diff --git a/drivers/md/dm-dust.c b/drivers/md/dm-dust.c index ff03b90072c5..f1f2dd6a4e84 100644 --- a/drivers/md/dm-dust.c +++ b/drivers/md/dm-dust.c @@ -138,20 +138,22 @@ static int dust_add_block(struct dust_device *dd, unsigned long long block, return 0; } -static int dust_query_block(struct dust_device *dd, unsigned long long block) +static int dust_query_block(struct dust_device *dd, unsigned long long block, char *result, + unsigned int maxlen, unsigned int *sz_ptr) { struct badblock *bblock; unsigned long flags; + unsigned int sz = *sz_ptr; spin_lock_irqsave(&dd->dust_lock, flags); bblock = dust_rb_search(&dd->badblocklist, block); if (bblock != NULL) - DMINFO("%s: block %llu found in badblocklist", __func__, block); + DMEMIT("%s: block %llu found in badblocklist", __func__, block); else - DMINFO("%s: block %llu not found in badblocklist", __func__, block); + DMEMIT("%s: block %llu not found in badblocklist", __func__, block); spin_unlock_irqrestore(&dd->dust_lock, flags); - return 0; + return 1; } static int __dust_map_read(struct dust_device *dd, sector_t 
thisblock) @@ -259,11 +261,13 @@ static bool __dust_clear_badblocks(struct rb_root *tree, return true; } -static int dust_clear_badblocks(struct dust_device *dd) +static int dust_clear_badblocks(struct dust_device *dd, char *result, unsigned int maxlen, + unsigned int *sz_ptr) { unsigned long flags; struct rb_root badblocklist; unsigned long long badblock_count; + unsigned int sz = *sz_ptr; spin_lock_irqsave(&dd->dust_lock, flags); badblocklist = dd->badblocklist; @@ -273,11 +277,11 @@ static int dust_clear_badblocks(struct dust_device *dd) spin_unlock_irqrestore(&dd->dust_lock, flags); if (!__dust_clear_badblocks(&badblocklist, badblock_count)) - DMINFO("%s: no badblocks found", __func__); + DMEMIT("%s: no badblocks found", __func__); else - DMINFO("%s: badblocks cleared", __func__); + DMEMIT("%s: badblocks cleared", __func__); - return 0; + return 1; } /* @@ -383,7 +387,7 @@ static void dust_dtr(struct dm_target *ti) } static int dust_message(struct dm_target *ti, unsigned int argc, char **argv, - char *result_buf, unsigned int maxlen) + char *result, unsigned int maxlen) { struct dust_device *dd = ti->private; sector_t size = i_size_read(dd->dev->bdev->bd_inode) >> SECTOR_SHIFT; @@ -393,6 +397,7 @@ static int dust_message(struct dm_target *ti, unsigned int argc, char **argv, unsigned char wr_fail_cnt; unsigned int tmp_ui; unsigned long flags; + unsigned int sz = 0; char dummy; if (argc == 1) { @@ -410,12 +415,12 @@ static int dust_message(struct dm_target *ti, unsigned int argc, char **argv, r = 0; } else if (!strcasecmp(argv[0], "countbadblocks")) { spin_lock_irqsave(&dd->dust_lock, flags); - DMINFO("countbadblocks: %llu badblock(s) found", + DMEMIT("countbadblocks: %llu badblock(s) found", dd->badblock_count); spin_unlock_irqrestore(&dd->dust_lock, flags); - r = 0; + r = 1; } else if (!strcasecmp(argv[0], "clearbadblocks")) { - r = dust_clear_badblocks(dd); + r = dust_clear_badblocks(dd, result, maxlen, &sz); } else if (!strcasecmp(argv[0], "quiet")) { if (!dd->quiet_mode) dd->quiet_mode = true; @@ -441,7 +446,7 @@ static int dust_message(struct dm_target *ti, unsigned int argc, char **argv, else if (!strcasecmp(argv[0], "removebadblock")) r = dust_remove_block(dd, block); else if (!strcasecmp(argv[0], "queryblock")) - r = dust_query_block(dd, block); + r = dust_query_block(dd, block, result, maxlen, &sz); else invalid_msg = true; From 0c248ea27fc88cf8a3035ba0ed75b210be9abf80 Mon Sep 17 00:00:00 2001 From: yangerkun Date: Fri, 19 Jun 2020 17:12:42 -0400 Subject: [PATCH 10/20] dm dust: add interface to list all badblocks This interface may help anyone who want to know all badblocks without querying for each block. [Bryan: DMEMIT message if no blocks are in the bad block list.] 
Signed-off-by: yangerkun Signed-off-by: Bryan Gurney Signed-off-by: Mike Snitzer --- .../admin-guide/device-mapper/dm-dust.rst | 17 ++++++++++++ drivers/md/dm-dust.c | 27 +++++++++++++++++++ 2 files changed, 44 insertions(+) diff --git a/Documentation/admin-guide/device-mapper/dm-dust.rst b/Documentation/admin-guide/device-mapper/dm-dust.rst index cf8079e368de..e35ec8cd2f88 100644 --- a/Documentation/admin-guide/device-mapper/dm-dust.rst +++ b/Documentation/admin-guide/device-mapper/dm-dust.rst @@ -206,6 +206,22 @@ appear:: dust_clear_badblocks: no badblocks found +Listing the bad block list +-------------------------- + +To list all bad blocks in the bad block list (using an example device +with blocks 1 and 2 in the bad block list), run the following message +command:: + + $ sudo dmsetup message dust1 0 listbadblocks + 1 + 2 + +If there are no bad blocks in the bad block list, the command will +execute with no output:: + + $ sudo dmsetup message dust1 0 listbadblocks + Message commands list --------------------- @@ -224,6 +240,7 @@ Single argument message commands:: countbadblocks clearbadblocks + listbadblocks disable enable quiet diff --git a/drivers/md/dm-dust.c b/drivers/md/dm-dust.c index f1f2dd6a4e84..072ea913cebc 100644 --- a/drivers/md/dm-dust.c +++ b/drivers/md/dm-dust.c @@ -284,6 +284,31 @@ static int dust_clear_badblocks(struct dust_device *dd, char *result, unsigned i return 1; } +static int dust_list_badblocks(struct dust_device *dd, char *result, unsigned int maxlen, + unsigned int *sz_ptr) +{ + unsigned long flags; + struct rb_root badblocklist; + struct rb_node *node; + struct badblock *bblk; + unsigned int sz = *sz_ptr; + unsigned long long num = 0; + + spin_lock_irqsave(&dd->dust_lock, flags); + badblocklist = dd->badblocklist; + for (node = rb_first(&badblocklist); node; node = rb_next(node)) { + bblk = rb_entry(node, struct badblock, node); + DMEMIT("%llu\n", bblk->bb); + num++; + } + + spin_unlock_irqrestore(&dd->dust_lock, flags); + if (!num) + DMEMIT("No blocks in badblocklist"); + + return 1; +} + /* * Target parameters: * @@ -427,6 +452,8 @@ static int dust_message(struct dm_target *ti, unsigned int argc, char **argv, else dd->quiet_mode = false; r = 0; + } else if (!strcasecmp(argv[0], "listbadblocks")) { + r = dust_list_badblocks(dd, result, maxlen, &sz); } else { invalid_msg = true; } From e766668c6cd49d741cfb49eaeb38998ba34d27bc Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 19 Jun 2020 16:42:14 +0800 Subject: [PATCH 11/20] dm rq: don't call blk_mq_queue_stopped() in dm_stop_queue() dm_stop_queue() only uses blk_mq_quiesce_queue() so it doesn't formally stop the blk-mq queue; therefore there is no point making the blk_mq_queue_stopped() check -- it will never be stopped. In addition, even though dm_stop_queue() actually tries to quiesce hw queues via blk_mq_quiesce_queue(), checking with blk_queue_quiesced() to avoid unnecessary queue quiesce isn't reliable because: the QUEUE_FLAG_QUIESCED flag is set before synchronize_rcu() and dm_stop_queue() may be called when synchronize_rcu() from another blk_mq_quiesce_queue() is in-progress. 
Fixes: 7b17c2f7292ba ("dm: Fix a race condition related to stopping and starting queues") Signed-off-by: Ming Lei Signed-off-by: Mike Snitzer --- drivers/md/dm-rq.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index 85e0daabad49..20745e2e34b9 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -70,9 +70,6 @@ void dm_start_queue(struct request_queue *q) void dm_stop_queue(struct request_queue *q) { - if (blk_mq_queue_stopped(q)) - return; - blk_mq_quiesce_queue(q); } From 70704c33db841d847b16d8ae7e29ae3b8a479e75 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 3 Jul 2020 10:26:46 -0400 Subject: [PATCH 12/20] dm bufio: do buffer cleanup from a workqueue Until now, DM bufio's waiting for IO from reclaim context in its shrinker has caused kswapd to block; which results in systemic IO stalls and even deadlock, e.g.: https://www.redhat.com/archives/dm-devel/2020-March/msg00025.html Here is Dave Chinner's problem description that motivated this fix, from: https://lore.kernel.org/linux-fsdevel/20190809215733.GZ7777@dread.disaster.area/ "Waiting for IO in kswapd reclaim context is considered harmful - kswapd context shrinker reclaim should be as non-blocking as possible, and any back-off to wait for IO to complete should be done by the high level reclaim core once it's completed an entire reclaim scan cycle of everything.... What follows from that, and is pertinent in this situation, is that if you don't block kswapd, then other reclaim contexts are not going to get stuck waiting for it regardless of the reclaim context they use." Continued elsewhere: "The only way to fix this problem once and for all is to stop using the shrinker as a mechanism to issue and wait on IO. If you need background writeback of dirty buffers, do it from a WQ_MEM_RECLAIM workqueue that isn't directly in the memory reclaim path and so can issue writeback and block safely from a GFP_KERNEL context. Kick the workqueue from the shrinker context, but get rid of the IO submission and waiting from the shrinker and all the GFP_NOFS memory reclaim recursion problems go away." As such, this commit moves buffer cleanup to a workqueue. 
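The shape of the change — the non-blocking callback only records how much to reclaim and kicks a worker, which then does the blocking cleanup — can be sketched in user space; pthreads stand in for the WQ_MEM_RECLAIM workqueue and all names are illustrative:

  #include <pthread.h>
  #include <stdatomic.h>
  #include <stdio.h>
  #include <unistd.h>

  static atomic_long need_shrink;
  static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
  static pthread_cond_t kick = PTHREAD_COND_INITIALIZER;

  /* "Shrinker" context: must not wait for I/O.  Record the work and wake
   * the worker, the way dm_bufio_shrink_scan() now calls queue_work(). */
  static long shrink_scan(long nr_to_scan)
  {
          atomic_fetch_add(&need_shrink, nr_to_scan);
          pthread_mutex_lock(&lock);
          pthread_cond_signal(&kick);
          pthread_mutex_unlock(&lock);
          return nr_to_scan;
  }

  /* Worker: free to block, e.g. waiting for buffer write-back. */
  static void *shrink_work(void *unused)
  {
          pthread_mutex_lock(&lock);
          while (atomic_load(&need_shrink) == 0)
                  pthread_cond_wait(&kick, &lock);
          pthread_mutex_unlock(&lock);

          while (atomic_load(&need_shrink) > 0) {
                  usleep(1000);           /* stand-in for blocking eviction */
                  atomic_fetch_sub(&need_shrink, 1);
          }
          return NULL;
  }

  int main(void)
  {
          pthread_t worker;

          pthread_create(&worker, NULL, shrink_work, NULL);
          shrink_scan(8);
          pthread_join(worker, NULL);
          printf("pending: %ld\n", atomic_load(&need_shrink));
          return 0;
  }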
Suggested-by: Dave Chinner Reported-by: Tahsin Erdogan Signed-off-by: Mikulas Patocka Tested-by: Gabriel Krisman Bertazi Signed-off-by: Mike Snitzer --- drivers/md/dm-bufio.c | 60 +++++++++++++++++++++++++++++-------------- 1 file changed, 41 insertions(+), 19 deletions(-) diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 6d1565021d74..9c1a86bde658 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -108,7 +108,10 @@ struct dm_bufio_client { int async_write_error; struct list_head client_list; + struct shrinker shrinker; + struct work_struct shrink_work; + atomic_long_t need_shrink; }; /* @@ -1634,8 +1637,7 @@ static unsigned long get_retain_buffers(struct dm_bufio_client *c) return retain_bytes; } -static unsigned long __scan(struct dm_bufio_client *c, unsigned long nr_to_scan, - gfp_t gfp_mask) +static void __scan(struct dm_bufio_client *c) { int l; struct dm_buffer *b, *tmp; @@ -1646,42 +1648,58 @@ static unsigned long __scan(struct dm_bufio_client *c, unsigned long nr_to_scan, for (l = 0; l < LIST_SIZE; l++) { list_for_each_entry_safe_reverse(b, tmp, &c->lru[l], lru_list) { - if (__try_evict_buffer(b, gfp_mask)) + if (count - freed <= retain_target) + atomic_long_set(&c->need_shrink, 0); + if (!atomic_long_read(&c->need_shrink)) + return; + if (__try_evict_buffer(b, GFP_KERNEL)) { + atomic_long_dec(&c->need_shrink); freed++; - if (!--nr_to_scan || ((count - freed) <= retain_target)) - return freed; + } cond_resched(); } } - return freed; } -static unsigned long -dm_bufio_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) +static void shrink_work(struct work_struct *w) +{ + struct dm_bufio_client *c = container_of(w, struct dm_bufio_client, shrink_work); + + dm_bufio_lock(c); + __scan(c); + dm_bufio_unlock(c); +} + +static unsigned long dm_bufio_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) { struct dm_bufio_client *c; - unsigned long freed; c = container_of(shrink, struct dm_bufio_client, shrinker); - if (sc->gfp_mask & __GFP_FS) - dm_bufio_lock(c); - else if (!dm_bufio_trylock(c)) - return SHRINK_STOP; + atomic_long_add(sc->nr_to_scan, &c->need_shrink); + queue_work(dm_bufio_wq, &c->shrink_work); - freed = __scan(c, sc->nr_to_scan, sc->gfp_mask); - dm_bufio_unlock(c); - return freed; + return sc->nr_to_scan; } -static unsigned long -dm_bufio_shrink_count(struct shrinker *shrink, struct shrink_control *sc) +static unsigned long dm_bufio_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { struct dm_bufio_client *c = container_of(shrink, struct dm_bufio_client, shrinker); unsigned long count = READ_ONCE(c->n_buffers[LIST_CLEAN]) + READ_ONCE(c->n_buffers[LIST_DIRTY]); unsigned long retain_target = get_retain_buffers(c); + unsigned long queued_for_cleanup = atomic_long_read(&c->need_shrink); - return (count < retain_target) ? 
0 : (count - retain_target); + if (unlikely(count < retain_target)) + count = 0; + else + count -= retain_target; + + if (unlikely(count < queued_for_cleanup)) + count = 0; + else + count -= queued_for_cleanup; + + return count; } /* @@ -1772,6 +1790,9 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign __free_buffer_wake(b); } + INIT_WORK(&c->shrink_work, shrink_work); + atomic_long_set(&c->need_shrink, 0); + c->shrinker.count_objects = dm_bufio_shrink_count; c->shrinker.scan_objects = dm_bufio_shrink_scan; c->shrinker.seeks = 1; @@ -1817,6 +1838,7 @@ void dm_bufio_client_destroy(struct dm_bufio_client *c) drop_buffers(c); unregister_shrinker(&c->shrinker); + flush_work(&c->shrink_work); mutex_lock(&dm_bufio_clients_lock); From 39d42fa96ba1b7d2544db3f8ed5da8fb0d5cb877 Mon Sep 17 00:00:00 2001 From: Ignat Korchagin Date: Mon, 6 Jul 2020 18:37:31 +0100 Subject: [PATCH 13/20] dm crypt: add flags to optionally bypass kcryptd workqueues This is a follow up to [1] that detailed latency problems associated with dm-crypt's use of workqueues when processing IO. Current dm-crypt implementation creates a significant IO performance overhead (at least on small IO block sizes) for both latency and throughput. We suspect offloading IO request processing into workqueues and async threads is more harmful these days with the modern fast storage. I also did some digging into the dm-crypt git history and much of this async processing is not needed anymore, because the reasons it was added are mostly gone from the kernel. More details can be found in [2] (see "Git archeology" section). This change adds DM_CRYPT_NO_READ_WORKQUEUE and DM_CRYPT_NO_WRITE_WORKQUEUE flags for read and write BIOs, which direct dm-crypt to not offload crypto operations into kcryptd workqueues. In addition, writes are not buffered to be sorted in the dm-crypt red-black tree, but dispatched immediately. For cases, where crypto operations cannot happen (hard interrupt context, for example the read path of some NVME drivers), we offload the work to a tasklet rather than a workqueue. These flags only ensure no async BIO processing in the dm-crypt module. It is worth noting that some Crypto API implementations may offload encryption into their own workqueues, which are independent of the dm-crypt and its configuration. However upon enabling these new flags dm-crypt will instruct Crypto API not to backlog crypto requests. To give an idea of the performance gains for certain workloads, consider the script, and results when tested against various devices, detailed here: https://www.redhat.com/archives/dm-devel/2020-July/msg00138.html [1]: https://www.spinics.net/lists/dm-crypt/msg07516.html [2]: https://blog.cloudflare.com/speeding-up-linux-disk-encryption/ Signed-off-by: Ignat Korchagin Reviewed-by: Damien Le Moal Reviewed-by: Bob Liu Signed-off-by: Mike Snitzer --- drivers/md/dm-crypt.c | 50 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 42 insertions(+), 8 deletions(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 000ddfab5ba0..7536ecb2c95d 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -69,6 +69,7 @@ struct dm_crypt_io { u8 *integrity_metadata; bool integrity_metadata_from_pool; struct work_struct work; + struct tasklet_struct tasklet; struct convert_context ctx; @@ -127,7 +128,8 @@ struct iv_elephant_private { * and encrypts / decrypts at the same time. 
*/ enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID, - DM_CRYPT_SAME_CPU, DM_CRYPT_NO_OFFLOAD }; + DM_CRYPT_SAME_CPU, DM_CRYPT_NO_OFFLOAD, + DM_CRYPT_NO_READ_WORKQUEUE, DM_CRYPT_NO_WRITE_WORKQUEUE }; enum cipher_flags { CRYPT_MODE_INTEGRITY_AEAD, /* Use authenticated mode for cihper */ @@ -1523,7 +1525,7 @@ static void crypt_free_req(struct crypt_config *cc, void *req, struct bio *base_ * Encrypt / decrypt data from one bio to another one (can be the same one) */ static blk_status_t crypt_convert(struct crypt_config *cc, - struct convert_context *ctx) + struct convert_context *ctx, bool atomic) { unsigned int tag_offset = 0; unsigned int sector_step = cc->sector_size >> SECTOR_SHIFT; @@ -1566,7 +1568,8 @@ static blk_status_t crypt_convert(struct crypt_config *cc, atomic_dec(&ctx->cc_pending); ctx->cc_sector += sector_step; tag_offset++; - cond_resched(); + if (!atomic) + cond_resched(); continue; /* * There was a data integrity error. @@ -1892,7 +1895,8 @@ static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async) clone->bi_iter.bi_sector = cc->start + io->sector; - if (likely(!async) && test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags)) { + if ((likely(!async) && test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags)) || + test_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags)) { generic_make_request(clone); return; } @@ -1941,7 +1945,8 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io) sector += bio_sectors(clone); crypt_inc_pending(io); - r = crypt_convert(cc, &io->ctx); + r = crypt_convert(cc, &io->ctx, + test_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags)); if (r) io->error = r; crypt_finished = atomic_dec_and_test(&io->ctx.cc_pending); @@ -1971,7 +1976,8 @@ static void kcryptd_crypt_read_convert(struct dm_crypt_io *io) crypt_convert_init(cc, &io->ctx, io->base_bio, io->base_bio, io->sector); - r = crypt_convert(cc, &io->ctx); + r = crypt_convert(cc, &io->ctx, + test_bit(DM_CRYPT_NO_READ_WORKQUEUE, &cc->flags)); if (r) io->error = r; @@ -2031,10 +2037,28 @@ static void kcryptd_crypt(struct work_struct *work) kcryptd_crypt_write_convert(io); } +static void kcryptd_crypt_tasklet(unsigned long work) +{ + kcryptd_crypt((struct work_struct *)work); +} + static void kcryptd_queue_crypt(struct dm_crypt_io *io) { struct crypt_config *cc = io->cc; + if ((bio_data_dir(io->base_bio) == READ && test_bit(DM_CRYPT_NO_READ_WORKQUEUE, &cc->flags)) || + (bio_data_dir(io->base_bio) == WRITE && test_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags))) { + if (in_irq()) { + /* Crypto API's "skcipher_walk_first() refuses to work in hard IRQ context */ + tasklet_init(&io->tasklet, kcryptd_crypt_tasklet, (unsigned long)&io->work); + tasklet_schedule(&io->tasklet); + return; + } + + kcryptd_crypt(&io->work); + return; + } + INIT_WORK(&io->work, kcryptd_crypt); queue_work(cc->crypt_queue, &io->work); } @@ -2838,7 +2862,7 @@ static int crypt_ctr_optional(struct dm_target *ti, unsigned int argc, char **ar struct crypt_config *cc = ti->private; struct dm_arg_set as; static const struct dm_arg _args[] = { - {0, 6, "Invalid number of feature args"}, + {0, 8, "Invalid number of feature args"}, }; unsigned int opt_params, val; const char *opt_string, *sval; @@ -2868,6 +2892,10 @@ static int crypt_ctr_optional(struct dm_target *ti, unsigned int argc, char **ar else if (!strcasecmp(opt_string, "submit_from_crypt_cpus")) set_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags); + else if (!strcasecmp(opt_string, "no_read_workqueue")) + set_bit(DM_CRYPT_NO_READ_WORKQUEUE, &cc->flags); + else if (!strcasecmp(opt_string, 
"no_write_workqueue")) + set_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags); else if (sscanf(opt_string, "integrity:%u:", &val) == 1) { if (val == 0 || val > MAX_TAG_SIZE) { ti->error = "Invalid integrity arguments"; @@ -3196,6 +3224,8 @@ static void crypt_status(struct dm_target *ti, status_type_t type, num_feature_args += !!ti->num_discard_bios; num_feature_args += test_bit(DM_CRYPT_SAME_CPU, &cc->flags); num_feature_args += test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags); + num_feature_args += test_bit(DM_CRYPT_NO_READ_WORKQUEUE, &cc->flags); + num_feature_args += test_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags); num_feature_args += cc->sector_size != (1 << SECTOR_SHIFT); num_feature_args += test_bit(CRYPT_IV_LARGE_SECTORS, &cc->cipher_flags); if (cc->on_disk_tag_size) @@ -3208,6 +3238,10 @@ static void crypt_status(struct dm_target *ti, status_type_t type, DMEMIT(" same_cpu_crypt"); if (test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags)) DMEMIT(" submit_from_crypt_cpus"); + if (test_bit(DM_CRYPT_NO_READ_WORKQUEUE, &cc->flags)) + DMEMIT(" no_read_workqueue"); + if (test_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags)) + DMEMIT(" no_write_workqueue"); if (cc->on_disk_tag_size) DMEMIT(" integrity:%u:%s", cc->on_disk_tag_size, cc->cipher_auth); if (cc->sector_size != (1 << SECTOR_SHIFT)) @@ -3320,7 +3354,7 @@ static void crypt_io_hints(struct dm_target *ti, struct queue_limits *limits) static struct target_type crypt_target = { .name = "crypt", - .version = {1, 21, 0}, + .version = {1, 22, 0}, .module = THIS_MODULE, .ctr = crypt_ctr, .dtr = crypt_dtr, From 8e225f04d2dd6c1ca1eaecd5b04eaa90284df507 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 8 Jul 2020 18:28:08 +0900 Subject: [PATCH 14/20] dm crypt: Enable zoned block device support Enable support for zoned block devices. This is done by: 1) implementing the target report_zones method. 2) adding the DM_TARGET_ZONED_HM flag to the target features. 3) setting DM_CRYPT_NO_WRITE_WORKQUEUE flag to avoid IO processing via workqueue. 4) Introducing inline write encryption completion to preserve write ordering. The last point is implemented by introducing the internal flag DM_CRYPT_WRITE_INLINE. When set, kcryptd_crypt_write_convert() always waits inline for the completion of a write request encryption if the request is not already completed once crypt_convert() returns. Completion of write request encryption is signaled using the restart completion by kcryptd_async_done(). This mechanism allows using ciphers that have an asynchronous implementation, isolating dm-crypt from any potential request completion reordering for these ciphers. 
Signed-off-by: Damien Le Moal Signed-off-by: Mike Snitzer --- drivers/md/dm-crypt.c | 83 +++++++++++++++++++++++++++++++++++++++---- 1 file changed, 76 insertions(+), 7 deletions(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 7536ecb2c95d..bad05c5ed3b5 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -129,7 +129,8 @@ struct iv_elephant_private { */ enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID, DM_CRYPT_SAME_CPU, DM_CRYPT_NO_OFFLOAD, - DM_CRYPT_NO_READ_WORKQUEUE, DM_CRYPT_NO_WRITE_WORKQUEUE }; + DM_CRYPT_NO_READ_WORKQUEUE, DM_CRYPT_NO_WRITE_WORKQUEUE, + DM_CRYPT_WRITE_INLINE }; enum cipher_flags { CRYPT_MODE_INTEGRITY_AEAD, /* Use authenticated mode for cihper */ @@ -1919,9 +1920,32 @@ static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async) spin_unlock_irqrestore(&cc->write_thread_lock, flags); } +static bool kcryptd_crypt_write_inline(struct crypt_config *cc, + struct convert_context *ctx) + +{ + if (!test_bit(DM_CRYPT_WRITE_INLINE, &cc->flags)) + return false; + + /* + * Note: zone append writes (REQ_OP_ZONE_APPEND) do not have ordering + * constraints so they do not need to be issued inline by + * kcryptd_crypt_write_convert(). + */ + switch (bio_op(ctx->bio_in)) { + case REQ_OP_WRITE: + case REQ_OP_WRITE_SAME: + case REQ_OP_WRITE_ZEROES: + return true; + default: + return false; + } +} + static void kcryptd_crypt_write_convert(struct dm_crypt_io *io) { struct crypt_config *cc = io->cc; + struct convert_context *ctx = &io->ctx; struct bio *clone; int crypt_finished; sector_t sector = io->sector; @@ -1931,7 +1955,7 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io) * Prevent io from disappearing until this function completes. */ crypt_inc_pending(io); - crypt_convert_init(cc, &io->ctx, NULL, io->base_bio, sector); + crypt_convert_init(cc, ctx, NULL, io->base_bio, sector); clone = crypt_alloc_buffer(io, io->base_bio->bi_iter.bi_size); if (unlikely(!clone)) { @@ -1945,11 +1969,16 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io) sector += bio_sectors(clone); crypt_inc_pending(io); - r = crypt_convert(cc, &io->ctx, + r = crypt_convert(cc, ctx, test_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags)); if (r) io->error = r; - crypt_finished = atomic_dec_and_test(&io->ctx.cc_pending); + crypt_finished = atomic_dec_and_test(&ctx->cc_pending); + if (!crypt_finished && kcryptd_crypt_write_inline(cc, ctx)) { + /* Wait for completion signaled by kcryptd_async_done() */ + wait_for_completion(&ctx->restart); + crypt_finished = 1; + } /* Encryption was already finished, submit io now */ if (crypt_finished) { @@ -2021,10 +2050,21 @@ static void kcryptd_async_done(struct crypto_async_request *async_req, if (!atomic_dec_and_test(&ctx->cc_pending)) return; - if (bio_data_dir(io->base_bio) == READ) + /* + * The request is fully completed: for inline writes, let + * kcryptd_crypt_write_convert() do the IO submission. 
+ */ + if (bio_data_dir(io->base_bio) == READ) { kcryptd_crypt_read_done(io); - else - kcryptd_crypt_write_io_submit(io, 1); + return; + } + + if (kcryptd_crypt_write_inline(cc, ctx)) { + complete(&ctx->restart); + return; + } + + kcryptd_crypt_write_io_submit(io, 1); } static void kcryptd_crypt(struct work_struct *work) @@ -2936,6 +2976,21 @@ static int crypt_ctr_optional(struct dm_target *ti, unsigned int argc, char **ar return 0; } +#ifdef CONFIG_BLK_DEV_ZONED + +static int crypt_report_zones(struct dm_target *ti, + struct dm_report_zones_args *args, unsigned int nr_zones) +{ + struct crypt_config *cc = ti->private; + sector_t sector = cc->start + dm_target_offset(ti, args->next_sector); + + args->start = cc->start; + return blkdev_report_zones(cc->dev->bdev, sector, nr_zones, + dm_report_zones_cb, args); +} + +#endif + /* * Construct an encryption mapping: * [|:::] @@ -3069,6 +3124,16 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) } cc->start = tmpll; + /* + * For zoned block devices, we need to preserve the issuer write + * ordering. To do so, disable write workqueues and force inline + * encryption completion. + */ + if (bdev_is_zoned(cc->dev->bdev)) { + set_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags); + set_bit(DM_CRYPT_WRITE_INLINE, &cc->flags); + } + if (crypt_integrity_aead(cc) || cc->integrity_iv_size) { ret = crypt_integrity_ctr(cc, ti); if (ret) @@ -3358,6 +3423,10 @@ static struct target_type crypt_target = { .module = THIS_MODULE, .ctr = crypt_ctr, .dtr = crypt_dtr, +#ifdef CONFIG_BLK_DEV_ZONED + .features = DM_TARGET_ZONED_HM, + .report_zones = crypt_report_zones, +#endif .map = crypt_map, .status = crypt_status, .postsuspend = crypt_postsuspend, From a84c4308333a2cbd54593649a8b144df95c68227 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 16 Jul 2020 13:38:12 +0900 Subject: [PATCH 15/20] dm verity: Fix compilation warning For the case !CONFIG_DM_VERITY_VERIFY_ROOTHASH_SIG, declare the functions verity_verify_root_hash(), verity_verify_is_sig_opt_arg(), verity_verify_sig_parse_opt_args() and verity_verify_sig_opts_cleanup() as inline to avoid a "no previous prototype for xxx" compilation warning when compiling with W=1. 
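The underlying issue is the usual one for config-off stubs in a header: a non-static, non-inline definition there has no previous prototype and gets emitted into every including unit. A minimal stand-alone illustration of the pattern the patch applies (hypothetical config toggle, real function signature from the header):

  #include <stddef.h>
  #include <stdio.h>

  /* #define CONFIG_DM_VERITY_VERIFY_ROOTHASH_SIG */

  #ifdef CONFIG_DM_VERITY_VERIFY_ROOTHASH_SIG
  int verity_verify_root_hash(const void *data, size_t data_len,
                              const void *sig_data, size_t sig_len);
  #else
  /* Config-off stub: static inline avoids the "no previous prototype"
   * warning and duplicate definitions across translation units. */
  static inline int verity_verify_root_hash(const void *data, size_t data_len,
                                            const void *sig_data, size_t sig_len)
  {
          return 0;
  }
  #endif

  int main(void)
  {
          printf("%d\n", verity_verify_root_hash(NULL, 0, NULL, 0));
          return 0;
  }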
Signed-off-by: Damien Le Moal Signed-off-by: Mike Snitzer --- drivers/md/dm-verity-verify-sig.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/md/dm-verity-verify-sig.h b/drivers/md/dm-verity-verify-sig.h index 19b1547aa741..3987c7141f79 100644 --- a/drivers/md/dm-verity-verify-sig.h +++ b/drivers/md/dm-verity-verify-sig.h @@ -34,25 +34,25 @@ void verity_verify_sig_opts_cleanup(struct dm_verity_sig_opts *sig_opts); #define DM_VERITY_ROOT_HASH_VERIFICATION_OPTS 0 -int verity_verify_root_hash(const void *data, size_t data_len, - const void *sig_data, size_t sig_len) +static inline int verity_verify_root_hash(const void *data, size_t data_len, + const void *sig_data, size_t sig_len) { return 0; } -bool verity_verify_is_sig_opt_arg(const char *arg_name) +static inline bool verity_verify_is_sig_opt_arg(const char *arg_name) { return false; } -int verity_verify_sig_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v, - struct dm_verity_sig_opts *sig_opts, - unsigned int *argc, const char *arg_name) +static inline int verity_verify_sig_parse_opt_args(struct dm_arg_set *as, + struct dm_verity *v, struct dm_verity_sig_opts *sig_opts, + unsigned int *argc, const char *arg_name) { return -EINVAL; } -void verity_verify_sig_opts_cleanup(struct dm_verity_sig_opts *sig_opts) +static inline void verity_verify_sig_opts_cleanup(struct dm_verity_sig_opts *sig_opts) { } From 04dc5330e5deebde2c88f8422c446e8212090ba3 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 16 Jul 2020 13:38:13 +0900 Subject: [PATCH 16/20] dm raid: Remove empty if statement In super_init_validation(), remove a body-less if statement testing only variables to avoid a compilation warning when compiling with W=1. Signed-off-by: Damien Le Moal Signed-off-by: Mike Snitzer --- drivers/md/dm-raid.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 10e8b2fe787b..08023c50aaa0 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -2345,8 +2345,6 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev) if (new_devs == rs->raid_disks || !rebuilds) { /* Replace a broken device */ - if (new_devs == 1 && !rs->delta_disks) - ; if (new_devs == rs->raid_disks) { DMINFO("Superblocks created for new raid set"); set_bit(MD_ARRAY_FIRST_USE, &mddev->flags); From 1aeb6e7cd118b90f1a227c79b26100af6a15960e Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 16 Jul 2020 13:38:14 +0900 Subject: [PATCH 17/20] dm ioctl: Fix compilation warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In retrieve_status(), when copying the target type name in the target_type string field of struct dm_target_spec, copy at most DM_MAX_TYPE_NAME - 1 character to avoid the compilation warning: warning: ‘__builtin_strncpy’ specified bound 16 equals destination size [-Wstringop-truncation] when compiling with W-1. 
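The warning points at a real hazard: with a bound equal to the destination size, strncpy() leaves the destination unterminated whenever the source fills it. A small stand-alone illustration (the 16-byte size mirrors DM_MAX_TYPE_NAME; everything else is illustrative):

  #include <stdio.h>
  #include <string.h>

  #define DM_MAX_TYPE_NAME 16

  int main(void)
  {
          char dst[DM_MAX_TYPE_NAME] = "";
          const char *src = "sixteen-chars-xx";  /* 16 chars, no room for NUL */

          /* Bound == sizeof(dst): no terminating NUL is written when
           * strlen(src) >= sizeof(dst), so printing dst would overrun. */
          strncpy(dst, src, sizeof(dst));

          /* Bound == sizeof(dst) - 1: the last byte stays NUL. */
          memset(dst, 0, sizeof(dst));
          strncpy(dst, src, sizeof(dst) - 1);
          printf("%s\n", dst);    /* truncated but NUL-terminated */
          return 0;
  }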
Signed-off-by: Damien Le Moal Signed-off-by: Mike Snitzer --- drivers/md/dm-ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 489935d5f22d..5792878dbff6 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -1168,7 +1168,7 @@ static void retrieve_status(struct dm_table *table, spec->sector_start = ti->begin; spec->length = ti->len; strncpy(spec->target_type, ti->type->name, - sizeof(spec->target_type)); + sizeof(spec->target_type) - 1); outptr += sizeof(struct dm_target_spec); remaining = len - (outptr - outbuf); From 90e6bf0659f11f3d8ffa7ae1d7a4fa2af015747a Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 16 Jul 2020 13:38:15 +0900 Subject: [PATCH 18/20] dm init: Set file local variable static Declare dm_allowed_targets as static to avoid the warning: drivers/md/dm-init.c:39:12: warning: symbol 'dm_allowed_targets' was not declared. Should it be static? when compiling with C=1. Signed-off-by: Damien Le Moal Signed-off-by: Mike Snitzer --- drivers/md/dm-init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-init.c b/drivers/md/dm-init.c index b869316d3722..b0c45c6ebe0b 100644 --- a/drivers/md/dm-init.c +++ b/drivers/md/dm-init.c @@ -36,7 +36,7 @@ struct dm_device { struct list_head list; }; -const char * const dm_allowed_targets[] __initconst = { +static const char * const dm_allowed_targets[] __initconst = { "crypt", "delay", "linear", From 4cb6f22612511ff2aba4c33fb0f281cae7c23772 Mon Sep 17 00:00:00 2001 From: John Dorminy Date: Fri, 31 Jul 2020 18:46:45 -0400 Subject: [PATCH 19/20] dm ebs: Fix incorrect checking for REQ_OP_FLUSH REQ_OP_FLUSH was being treated as a flag, but the operation part of bio->bi_opf must be treated as a whole. Change to accessing the operation part via bio_op(bio) and checking for equality. Signed-off-by: John Dorminy Acked-by: Heinz Mauelshagen Fixes: d3c7b35c20d60 ("dm: add emulated block size target") Cc: stable@vger.kernel.org Signed-off-by: Mike Snitzer --- drivers/md/dm-ebs-target.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-ebs-target.c b/drivers/md/dm-ebs-target.c index 44451276f128..cb85610527c2 100644 --- a/drivers/md/dm-ebs-target.c +++ b/drivers/md/dm-ebs-target.c @@ -363,7 +363,7 @@ static int ebs_map(struct dm_target *ti, struct bio *bio) bio_set_dev(bio, ec->dev->bdev); bio->bi_iter.bi_sector = ec->start + dm_target_offset(ti, bio->bi_iter.bi_sector); - if (unlikely(bio->bi_opf & REQ_OP_FLUSH)) + if (unlikely(bio_op(bio) == REQ_OP_FLUSH)) return DM_MAPIO_REMAPPED; /* * Only queue for bufio processing in case of partial or overlapping buffers From a9cb9f4148ef6bb8fabbdaa85c42b2171fbd5a0d Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Tue, 4 Aug 2020 18:25:01 +0900 Subject: [PATCH 20/20] dm: don't call report zones for more than the user requested Don't call report zones for more zones than the user actually requested, otherwise this can lead to out-of-bounds accesses in the callback functions. Such a situation can happen if the target's ->report_zones() callback function returns 0 because we've reached the end of the target and then restart the report zones on the second target. We're again calling into ->report_zones() and ultimately into the user supplied callback function but when we're not subtracting the number of zones already processed this may lead to out-of-bounds accesses in the user callbacks. 
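A stand-alone sketch of the arithmetic (simplified types and names, not the dm code): the caller allocates room for nr_zones entries, so each subsequent target must only be asked for the zones still missing.

  #include <stdio.h>

  #define NR_REQUESTED 4

  struct args {
          unsigned int zone_idx;
          int *out;               /* user-allocated, NR_REQUESTED entries */
  };

  static void user_cb(struct args *a, int zone)
  {
          a->out[a->zone_idx++] = zone;
  }

  /* One "target" holding 'avail' zones starting at 'base'; reports at
   * most 'nr' of them through the user callback. */
  static void target_report_zones(struct args *a, int base, unsigned int avail,
                                  unsigned int nr)
  {
          for (unsigned int i = 0; i < nr && i < avail; i++)
                  user_cb(a, base + i);
  }

  int main(void)
  {
          int zones[NR_REQUESTED];
          struct args a = { .zone_idx = 0, .out = zones };

          target_report_zones(&a, 0, 3, NR_REQUESTED);
          /* Asking the second target for NR_REQUESTED again could run the
           * callback past zones[]; ask only for the remainder. */
          target_report_zones(&a, 3, 3, NR_REQUESTED - a.zone_idx);

          printf("reported %u zones\n", a.zone_idx);
          return 0;
  }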
Signed-off-by: Johannes Thumshirn Reviewed-by: Damien Le Moal Fixes: d41003513e61 ("block: rework zone reporting") Cc: stable@vger.kernel.org # v5.5+ Signed-off-by: Mike Snitzer --- drivers/md/dm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 52449afd58eb..937a4194442f 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -503,7 +503,8 @@ static int dm_blk_report_zones(struct gendisk *disk, sector_t sector, } args.tgt = tgt; - ret = tgt->type->report_zones(tgt, &args, nr_zones); + ret = tgt->type->report_zones(tgt, &args, + nr_zones - args.zone_idx); if (ret < 0) goto out; } while (args.zone_idx < nr_zones &&