From 69cea0d45a618ad4ae74f36386ef1af5128b2b19 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Wed, 10 Jun 2020 15:02:37 -0400 Subject: [PATCH 01/20] dm mpath: changes from initial m->flags locking audit Fix locking in slow-paths where m->lock should be taken. Signed-off-by: Mike Snitzer --- drivers/md/dm-mpath.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 78cff42d987e..d7bb74bded8c 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -335,6 +335,8 @@ static int pg_init_all_paths(struct multipath *m) static void __switch_pg(struct multipath *m, struct priority_group *pg) { + lockdep_assert_held(&m->lock); + m->current_pg = pg; /* Must we initialise the PG first, and queue I/O till it's ready? */ @@ -382,7 +384,9 @@ static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes) unsigned bypassed = 1; if (!atomic_read(&m->nr_valid_paths)) { + spin_lock_irqsave(&m->lock, flags); clear_bit(MPATHF_QUEUE_IO, &m->flags); + spin_unlock_irqrestore(&m->lock, flags); goto failed; } @@ -422,8 +426,11 @@ check_current_pg: continue; pgpath = choose_path_in_pg(m, pg, nr_bytes); if (!IS_ERR_OR_NULL(pgpath)) { - if (!bypassed) + if (!bypassed) { + spin_lock_irqsave(&m->lock, flags); set_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags); + spin_unlock_irqrestore(&m->lock, flags); + } return pgpath; } } @@ -1662,9 +1669,9 @@ static int multipath_end_io_bio(struct dm_target *ti, struct bio *clone, spin_lock_irqsave(&m->lock, flags); bio_list_add(&m->queued_bios, clone); - spin_unlock_irqrestore(&m->lock, flags); if (!test_bit(MPATHF_QUEUE_IO, &m->flags)) queue_work(kmultipathd, &m->process_queued_bios); + spin_unlock_irqrestore(&m->lock, flags); r = DM_ENDIO_INCOMPLETE; done: @@ -1938,6 +1945,7 @@ static int multipath_prepare_ioctl(struct dm_target *ti, { struct multipath *m = ti->private; struct pgpath *current_pgpath; + unsigned long flags; int r; current_pgpath = READ_ONCE(m->current_pgpath); @@ -1965,8 +1973,10 @@ static int multipath_prepare_ioctl(struct dm_target *ti, /* Path status changed, redo selection */ (void) choose_pgpath(m, 0); } + spin_lock_irqsave(&m->lock, flags); if (test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) - pg_init_all_paths(m); + (void) __pg_init_all_paths(m); + spin_unlock_irqrestore(&m->lock, flags); dm_table_run_md_queue_async(m->ti->table); process_queued_io_list(m); } From a271a89c6e4773478b1c4f8213dfe8351ea66723 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Wed, 10 Jun 2020 16:07:57 -0400 Subject: [PATCH 02/20] dm mpath: take m->lock spinlock when testing QUEUE_IF_NO_PATH Fix multipath_end_io, multipath_end_io_bio and multipath_busy to take m->lock while testing if MPATHF_QUEUE_IF_NO_PATH bit is set. These are all slow-path cases when no paths are available so extra locking isn't a performance hit. Correctness matters most. 
Signed-off-by: Mike Snitzer --- drivers/md/dm-mpath.c | 52 ++++++++++++++++++++++++++++--------------- 1 file changed, 34 insertions(+), 18 deletions(-) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index d7bb74bded8c..bf5175805b1d 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -1621,12 +1621,16 @@ static int multipath_end_io(struct dm_target *ti, struct request *clone, if (pgpath) fail_path(pgpath); - if (atomic_read(&m->nr_valid_paths) == 0 && - !must_push_back_rq(m)) { - if (error == BLK_STS_IOERR) - dm_report_EIO(m); - /* complete with the original error */ - r = DM_ENDIO_DONE; + if (!atomic_read(&m->nr_valid_paths)) { + unsigned long flags; + spin_lock_irqsave(&m->lock, flags); + if (!must_push_back_rq(m)) { + if (error == BLK_STS_IOERR) + dm_report_EIO(m); + /* complete with the original error */ + r = DM_ENDIO_DONE; + } + spin_unlock_irqrestore(&m->lock, flags); } } @@ -1656,15 +1660,19 @@ static int multipath_end_io_bio(struct dm_target *ti, struct bio *clone, if (pgpath) fail_path(pgpath); - if (atomic_read(&m->nr_valid_paths) == 0 && - !test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) { - if (__must_push_back(m)) { - r = DM_ENDIO_REQUEUE; - } else { - dm_report_EIO(m); - *error = BLK_STS_IOERR; + if (!atomic_read(&m->nr_valid_paths)) { + spin_lock_irqsave(&m->lock, flags); + if (!test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) { + if (__must_push_back(m)) { + r = DM_ENDIO_REQUEUE; + } else { + dm_report_EIO(m); + *error = BLK_STS_IOERR; + } + spin_unlock_irqrestore(&m->lock, flags); + goto done; } - goto done; + spin_unlock_irqrestore(&m->lock, flags); } spin_lock_irqsave(&m->lock, flags); @@ -1962,10 +1970,11 @@ static int multipath_prepare_ioctl(struct dm_target *ti, } } else { /* No path is available */ + r = -EIO; + spin_lock_irqsave(&m->lock, flags); if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) r = -ENOTCONN; - else - r = -EIO; + spin_unlock_irqrestore(&m->lock, flags); } if (r == -ENOTCONN) { @@ -2036,8 +2045,15 @@ static int multipath_busy(struct dm_target *ti) return true; /* no paths available, for blk-mq: rely on IO mapping to delay requeue */ - if (!atomic_read(&m->nr_valid_paths) && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) - return (m->queue_mode != DM_TYPE_REQUEST_BASED); + if (!atomic_read(&m->nr_valid_paths)) { + unsigned long flags; + spin_lock_irqsave(&m->lock, flags); + if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) { + spin_unlock_irqrestore(&m->lock, flags); + return (m->queue_mode != DM_TYPE_REQUEST_BASED); + } + spin_unlock_irqrestore(&m->lock, flags); + } /* Guess which priority_group will be used at next mapping time */ pg = READ_ONCE(m->current_pg); From 73265f3ffdc90f0e27b79fbcba8530c75c74ef12 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Wed, 10 Jun 2020 16:51:32 -0400 Subject: [PATCH 03/20] dm mpath: push locking down to must_push_back_rq() Signed-off-by: Mike Snitzer --- drivers/md/dm-mpath.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index bf5175805b1d..87b6a2c98c73 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -472,7 +472,14 @@ static bool __must_push_back(struct multipath *m) static bool must_push_back_rq(struct multipath *m) { - return test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) || __must_push_back(m); + unsigned long flags; + bool ret; + + spin_lock_irqsave(&m->lock, flags); + ret = (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) || __must_push_back(m)); + spin_unlock_irqrestore(&m->lock, 
flags); + + return ret; } /* @@ -1621,16 +1628,12 @@ static int multipath_end_io(struct dm_target *ti, struct request *clone, if (pgpath) fail_path(pgpath); - if (!atomic_read(&m->nr_valid_paths)) { - unsigned long flags; - spin_lock_irqsave(&m->lock, flags); - if (!must_push_back_rq(m)) { - if (error == BLK_STS_IOERR) - dm_report_EIO(m); - /* complete with the original error */ - r = DM_ENDIO_DONE; - } - spin_unlock_irqrestore(&m->lock, flags); + if (!atomic_read(&m->nr_valid_paths) && + !must_push_back_rq(m)) { + if (error == BLK_STS_IOERR) + dm_report_EIO(m); + /* complete with the original error */ + r = DM_ENDIO_DONE; } } From f45f11868e0efeccfc1c695e56864f9032f18bdc Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Wed, 10 Jun 2020 20:03:19 -0400 Subject: [PATCH 04/20] dm mpath: factor out multipath_queue_bio Enables further cleanup. Signed-off-by: Mike Snitzer --- drivers/md/dm-mpath.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 87b6a2c98c73..4b48183c6b5c 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -574,6 +574,18 @@ static void multipath_release_clone(struct request *clone, * Map cloned bios (bio-based multipath) */ +static void multipath_queue_bio(struct multipath *m, struct bio *bio) +{ + unsigned long flags; + + /* Queue for the daemon to resubmit */ + spin_lock_irqsave(&m->lock, flags); + bio_list_add(&m->queued_bios, bio); + if (!test_bit(MPATHF_QUEUE_IO, &m->flags)) + queue_work(kmultipathd, &m->process_queued_bios); + spin_unlock_irqrestore(&m->lock, flags); +} + static struct pgpath *__map_bio(struct multipath *m, struct bio *bio) { struct pgpath *pgpath; @@ -590,16 +602,11 @@ static struct pgpath *__map_bio(struct multipath *m, struct bio *bio) if ((pgpath && queue_io) || (!pgpath && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))) { - /* Queue for the daemon to resubmit */ - spin_lock_irqsave(&m->lock, flags); - bio_list_add(&m->queued_bios, bio); - spin_unlock_irqrestore(&m->lock, flags); + multipath_queue_bio(m, bio); /* PG_INIT_REQUIRED cannot be set without QUEUE_IO */ if (queue_io || test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) pg_init_all_paths(m); - else if (!queue_io) - queue_work(kmultipathd, &m->process_queued_bios); return ERR_PTR(-EAGAIN); } @@ -1678,12 +1685,7 @@ static int multipath_end_io_bio(struct dm_target *ti, struct bio *clone, spin_unlock_irqrestore(&m->lock, flags); } - spin_lock_irqsave(&m->lock, flags); - bio_list_add(&m->queued_bios, clone); - if (!test_bit(MPATHF_QUEUE_IO, &m->flags)) - queue_work(kmultipathd, &m->process_queued_bios); - spin_unlock_irqrestore(&m->lock, flags); - + multipath_queue_bio(m, clone); r = DM_ENDIO_INCOMPLETE; done: if (pgpath) { From 17213ec1806199ab528655946af144abc37d89fb Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Wed, 10 Jun 2020 20:19:56 -0400 Subject: [PATCH 05/20] dm mpath: rework __map_bio() so that it follows same pattern as request-based multipath_clone_and_map() Signed-off-by: Mike Snitzer --- drivers/md/dm-mpath.c | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 4b48183c6b5c..ab6ccd619573 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -574,15 +574,20 @@ static void multipath_release_clone(struct request *clone, * Map cloned bios (bio-based multipath) */ +static void __multipath_queue_bio(struct multipath *m, struct bio *bio) +{ + /* Queue for the daemon to 
resubmit */ + bio_list_add(&m->queued_bios, bio); + if (!test_bit(MPATHF_QUEUE_IO, &m->flags)) + queue_work(kmultipathd, &m->process_queued_bios); +} + static void multipath_queue_bio(struct multipath *m, struct bio *bio) { unsigned long flags; - /* Queue for the daemon to resubmit */ spin_lock_irqsave(&m->lock, flags); - bio_list_add(&m->queued_bios, bio); - if (!test_bit(MPATHF_QUEUE_IO, &m->flags)) - queue_work(kmultipathd, &m->process_queued_bios); + __multipath_queue_bio(m, bio); spin_unlock_irqrestore(&m->lock, flags); } @@ -590,24 +595,24 @@ static struct pgpath *__map_bio(struct multipath *m, struct bio *bio) { struct pgpath *pgpath; unsigned long flags; - bool queue_io; /* Do we need to select a new pgpath? */ pgpath = READ_ONCE(m->current_pgpath); if (!pgpath || !test_bit(MPATHF_QUEUE_IO, &m->flags)) pgpath = choose_pgpath(m, bio->bi_iter.bi_size); - /* MPATHF_QUEUE_IO might have been cleared by choose_pgpath. */ - queue_io = test_bit(MPATHF_QUEUE_IO, &m->flags); + if (!pgpath) { + spin_lock_irqsave(&m->lock, flags); + if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) { + __multipath_queue_bio(m, bio); + pgpath = ERR_PTR(-EAGAIN); + } + spin_unlock_irqrestore(&m->lock, flags); - if ((pgpath && queue_io) || - (!pgpath && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))) { + } else if (test_bit(MPATHF_QUEUE_IO, &m->flags) || + test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) { multipath_queue_bio(m, bio); - - /* PG_INIT_REQUIRED cannot be set without QUEUE_IO */ - if (queue_io || test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) - pg_init_all_paths(m); - + pg_init_all_paths(m); return ERR_PTR(-EAGAIN); } From 564dbb130b3f8ab39719a42c130505bf75610fbf Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Wed, 10 Jun 2020 21:25:39 -0400 Subject: [PATCH 06/20] dm mpath: rename current_pgpath to pgpath in multipath_prepare_ioctl Makes consistent with __map_bio() and multipath_clone_and_map(). Signed-off-by: Mike Snitzer --- drivers/md/dm-mpath.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index ab6ccd619573..f71bb4e5eaf7 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -1962,17 +1962,17 @@ static int multipath_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) { struct multipath *m = ti->private; - struct pgpath *current_pgpath; + struct pgpath *pgpath; unsigned long flags; int r; - current_pgpath = READ_ONCE(m->current_pgpath); - if (!current_pgpath || !test_bit(MPATHF_QUEUE_IO, &m->flags)) - current_pgpath = choose_pgpath(m, 0); + pgpath = READ_ONCE(m->current_pgpath); + if (!pgpath || !test_bit(MPATHF_QUEUE_IO, &m->flags)) + pgpath = choose_pgpath(m, 0); - if (current_pgpath) { + if (pgpath) { if (!test_bit(MPATHF_QUEUE_IO, &m->flags)) { - *bdev = current_pgpath->path.dev->bdev; + *bdev = pgpath->path.dev->bdev; r = 0; } else { /* pg_init has not started or completed */ From 374117ad4736c5a4f8012cfe59fc07d9d58191d5 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Mon, 15 Jun 2020 11:31:04 -0400 Subject: [PATCH 07/20] dm mpath: use double checked locking in fast path Fast-path code biased toward lazy acknowledgement of bit being set (primarily only for initialization). Multipath code is very retry oriented so even if state is missed it'll recover. 
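The pattern this patch introduces (mpath_double_check_test_bit() in the diff below) can be illustrated with a minimal user-space sketch; it is not part of the patch, the pthread mutex merely stands in for m->lock and all names are illustrative:

  #include <pthread.h>
  #include <stdbool.h>
  #include <stdio.h>

  #define MPATHF_QUEUE_IO 0

  static unsigned long flags_word;
  static pthread_mutex_t m_lock = PTHREAD_MUTEX_INITIALIZER;

  static bool test_bit(int bit, const unsigned long *word)
  {
          return (*word >> bit) & 1UL;
  }

  /* Unlocked read as a hint; only a positive result is re-checked under
   * the lock before being acted on.  A missed transition is tolerated
   * because the multipath code retries. */
  static bool double_check_test_bit(int bit)
  {
          bool r = test_bit(bit, &flags_word);

          if (r) {
                  pthread_mutex_lock(&m_lock);
                  r = test_bit(bit, &flags_word);
                  pthread_mutex_unlock(&m_lock);
          }
          return r;
  }

  int main(void)
  {
          flags_word |= 1UL << MPATHF_QUEUE_IO;
          printf("QUEUE_IO set: %d\n", double_check_test_bit(MPATHF_QUEUE_IO));
          return 0;
  }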
Signed-off-by: Mike Snitzer --- drivers/md/dm-mpath.c | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index f71bb4e5eaf7..88dc7b41b1c6 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -128,6 +128,20 @@ static void queue_if_no_path_timeout_work(struct timer_list *t); #define MPATHF_PG_INIT_REQUIRED 5 /* pg_init needs calling? */ #define MPATHF_PG_INIT_DELAY_RETRY 6 /* Delay pg_init retry? */ +static bool mpath_double_check_test_bit(int MPATHF_bit, struct multipath *m) +{ + bool r = test_bit(MPATHF_bit, &m->flags); + + if (r) { + unsigned long flags; + spin_lock_irqsave(&m->lock, flags); + r = test_bit(MPATHF_bit, &m->flags); + spin_unlock_irqrestore(&m->lock, flags); + } + + return r; +} + /*----------------------------------------------- * Allocation routines *-----------------------------------------------*/ @@ -499,7 +513,7 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq, /* Do we need to select a new pgpath? */ pgpath = READ_ONCE(m->current_pgpath); - if (!pgpath || !test_bit(MPATHF_QUEUE_IO, &m->flags)) + if (!pgpath || !mpath_double_check_test_bit(MPATHF_QUEUE_IO, m)) pgpath = choose_pgpath(m, nr_bytes); if (!pgpath) { @@ -507,8 +521,8 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq, return DM_MAPIO_DELAY_REQUEUE; dm_report_EIO(m); /* Failed */ return DM_MAPIO_KILL; - } else if (test_bit(MPATHF_QUEUE_IO, &m->flags) || - test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) { + } else if (mpath_double_check_test_bit(MPATHF_QUEUE_IO, m) || + mpath_double_check_test_bit(MPATHF_PG_INIT_REQUIRED, m)) { pg_init_all_paths(m); return DM_MAPIO_DELAY_REQUEUE; } @@ -598,7 +612,7 @@ static struct pgpath *__map_bio(struct multipath *m, struct bio *bio) /* Do we need to select a new pgpath? 
*/ pgpath = READ_ONCE(m->current_pgpath); - if (!pgpath || !test_bit(MPATHF_QUEUE_IO, &m->flags)) + if (!pgpath || !mpath_double_check_test_bit(MPATHF_QUEUE_IO, m)) pgpath = choose_pgpath(m, bio->bi_iter.bi_size); if (!pgpath) { @@ -609,8 +623,8 @@ static struct pgpath *__map_bio(struct multipath *m, struct bio *bio) } spin_unlock_irqrestore(&m->lock, flags); - } else if (test_bit(MPATHF_QUEUE_IO, &m->flags) || - test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) { + } else if (mpath_double_check_test_bit(MPATHF_QUEUE_IO, m) || + mpath_double_check_test_bit(MPATHF_PG_INIT_REQUIRED, m)) { multipath_queue_bio(m, bio); pg_init_all_paths(m); return ERR_PTR(-EAGAIN); @@ -861,7 +875,7 @@ static int setup_scsi_dh(struct block_device *bdev, struct multipath *m, struct request_queue *q = bdev_get_queue(bdev); int r; - if (test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags)) { + if (mpath_double_check_test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, m)) { retain: if (*attached_handler_name) { /* @@ -1967,11 +1981,11 @@ static int multipath_prepare_ioctl(struct dm_target *ti, int r; pgpath = READ_ONCE(m->current_pgpath); - if (!pgpath || !test_bit(MPATHF_QUEUE_IO, &m->flags)) + if (!pgpath || !mpath_double_check_test_bit(MPATHF_QUEUE_IO, m)) pgpath = choose_pgpath(m, 0); if (pgpath) { - if (!test_bit(MPATHF_QUEUE_IO, &m->flags)) { + if (!mpath_double_check_test_bit(MPATHF_QUEUE_IO, m)) { *bdev = pgpath->path.dev->bdev; r = 0; } else { From e1fef0b08e06dcce275bd585bc6a900dff395096 Mon Sep 17 00:00:00 2001 From: JeongHyeon Lee Date: Thu, 18 Jun 2020 15:56:50 +0900 Subject: [PATCH 08/20] dm verity: add "panic_on_corruption" error handling mode Samsung smart phones may need the ability to panic on corruption. Not all devices provide the bootloader support needed to use the existing "restart_on_corruption" mode. Additional details for why Samsung needs this new mode can be found here: https://www.redhat.com/archives/dm-devel/2020-June/msg00235.html Signed-off-by: jhs2.lee Signed-off-by: Mike Snitzer --- Documentation/admin-guide/device-mapper/verity.rst | 4 ++++ drivers/md/dm-verity-target.c | 13 ++++++++++++- drivers/md/dm-verity.h | 3 ++- 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/device-mapper/verity.rst b/Documentation/admin-guide/device-mapper/verity.rst index bb02caa45289..66f71f0dab1b 100644 --- a/Documentation/admin-guide/device-mapper/verity.rst +++ b/Documentation/admin-guide/device-mapper/verity.rst @@ -83,6 +83,10 @@ restart_on_corruption not compatible with ignore_corruption and requires user space support to avoid restart loops. +panic_on_corruption + Panic the device when a corrupted block is discovered. This option is + not compatible with ignore_corruption and restart_on_corruption. + ignore_zero_blocks Do not verify blocks that are expected to contain zeroes and always return zeroes instead. 
This may be useful if the partition contains unused blocks diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index eec9f252e935..ddcd1d03beb9 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -30,6 +30,7 @@ #define DM_VERITY_OPT_LOGGING "ignore_corruption" #define DM_VERITY_OPT_RESTART "restart_on_corruption" +#define DM_VERITY_OPT_PANIC "panic_on_corruption" #define DM_VERITY_OPT_IGN_ZEROES "ignore_zero_blocks" #define DM_VERITY_OPT_AT_MOST_ONCE "check_at_most_once" @@ -254,6 +255,9 @@ out: if (v->mode == DM_VERITY_MODE_RESTART) kernel_restart("dm-verity device corrupted"); + if (v->mode == DM_VERITY_MODE_PANIC) + panic("dm-verity device corrupted"); + return 1; } @@ -742,6 +746,9 @@ static void verity_status(struct dm_target *ti, status_type_t type, case DM_VERITY_MODE_RESTART: DMEMIT(DM_VERITY_OPT_RESTART); break; + case DM_VERITY_MODE_PANIC: + DMEMIT(DM_VERITY_OPT_PANIC); + break; default: BUG(); } @@ -907,6 +914,10 @@ static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v, v->mode = DM_VERITY_MODE_RESTART; continue; + } else if (!strcasecmp(arg_name, DM_VERITY_OPT_PANIC)) { + v->mode = DM_VERITY_MODE_PANIC; + continue; + } else if (!strcasecmp(arg_name, DM_VERITY_OPT_IGN_ZEROES)) { r = verity_alloc_zero_digest(v); if (r) { @@ -1221,7 +1232,7 @@ bad: static struct target_type verity_target = { .name = "verity", - .version = {1, 6, 0}, + .version = {1, 7, 0}, .module = THIS_MODULE, .ctr = verity_ctr, .dtr = verity_dtr, diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h index 641b9e3a399b..4e769d13473a 100644 --- a/drivers/md/dm-verity.h +++ b/drivers/md/dm-verity.h @@ -20,7 +20,8 @@ enum verity_mode { DM_VERITY_MODE_EIO, DM_VERITY_MODE_LOGGING, - DM_VERITY_MODE_RESTART + DM_VERITY_MODE_RESTART, + DM_VERITY_MODE_PANIC }; enum verity_block_type { From 4f7f590b152444c1403ece9eeeddd1e8b22ba04e Mon Sep 17 00:00:00 2001 From: yangerkun Date: Fri, 19 Jun 2020 17:10:39 -0400 Subject: [PATCH 09/20] dm dust: report some message results directly back to user Some messages (queryblock, countbadblocks, removebadblock) are best reported directly to user directly. Do so with DMEMIT. 
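DMEMIT appends formatted text at a running offset into the bounded result buffer that the message interface hands to the target. A rough user-space analogue of that accumulation (illustrative names only, not kernel code):

  #include <stdarg.h>
  #include <stdio.h>

  /* Append to result[] at offset *sz without writing past maxlen. */
  static void emit(char *result, unsigned int maxlen, unsigned int *sz,
                   const char *fmt, ...)
  {
          va_list ap;
          int n;

          if (*sz >= maxlen)
                  return;
          va_start(ap, fmt);
          n = vsnprintf(result + *sz, maxlen - *sz, fmt, ap);
          va_end(ap);
          if (n > 0)
                  *sz += n;       /* further output is dropped once full */
  }

  int main(void)
  {
          char result[64];
          unsigned int sz = 0;

          emit(result, sizeof(result), &sz,
               "%s: block %llu found in badblocklist",
               "dust_query_block", 72ULL);
          puts(result);
          return 0;
  }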
[Bryan: maintain __func__ output in DMEMIT messages] Signed-off-by: yangerkun Signed-off-by: Bryan Gurney Signed-off-by: Mike Snitzer --- .../admin-guide/device-mapper/dm-dust.rst | 15 ++++----- drivers/md/dm-dust.c | 31 +++++++++++-------- 2 files changed, 26 insertions(+), 20 deletions(-) diff --git a/Documentation/admin-guide/device-mapper/dm-dust.rst b/Documentation/admin-guide/device-mapper/dm-dust.rst index b6e7e7ead831..cf8079e368de 100644 --- a/Documentation/admin-guide/device-mapper/dm-dust.rst +++ b/Documentation/admin-guide/device-mapper/dm-dust.rst @@ -69,10 +69,11 @@ Create the dm-dust device: $ sudo dmsetup create dust1 --table '0 33552384 dust /dev/vdb1 0 4096' Check the status of the read behavior ("bypass" indicates that all I/O -will be passed through to the underlying device):: +will be passed through to the underlying device; "verbose" indicates that +bad block additions, removals, and remaps will be verbosely logged):: $ sudo dmsetup status dust1 - 0 33552384 dust 252:17 bypass + 0 33552384 dust 252:17 bypass verbose $ sudo dd if=/dev/mapper/dust1 of=/dev/null bs=512 count=128 iflag=direct 128+0 records in @@ -164,7 +165,7 @@ following message command:: A message will print with the number of bad blocks currently configured on the device:: - kernel: device-mapper: dust: countbadblocks: 895 badblock(s) found + countbadblocks: 895 badblock(s) found Querying for specific bad blocks -------------------------------- @@ -176,11 +177,11 @@ following message command:: The following message will print if the block is in the list:: - device-mapper: dust: queryblock: block 72 found in badblocklist + dust_query_block: block 72 found in badblocklist The following message will print if the block is not in the list:: - device-mapper: dust: queryblock: block 72 not found in badblocklist + dust_query_block: block 72 not found in badblocklist The "queryblock" message command will work in both the "enabled" and "disabled" modes, allowing the verification of whether a block @@ -198,12 +199,12 @@ following message command:: After clearing the bad block list, the following message will appear:: - kernel: device-mapper: dust: clearbadblocks: badblocks cleared + dust_clear_badblocks: badblocks cleared If there were no bad blocks to clear, the following message will appear:: - kernel: device-mapper: dust: clearbadblocks: no badblocks found + dust_clear_badblocks: no badblocks found Message commands list --------------------- diff --git a/drivers/md/dm-dust.c b/drivers/md/dm-dust.c index ff03b90072c5..f1f2dd6a4e84 100644 --- a/drivers/md/dm-dust.c +++ b/drivers/md/dm-dust.c @@ -138,20 +138,22 @@ static int dust_add_block(struct dust_device *dd, unsigned long long block, return 0; } -static int dust_query_block(struct dust_device *dd, unsigned long long block) +static int dust_query_block(struct dust_device *dd, unsigned long long block, char *result, + unsigned int maxlen, unsigned int *sz_ptr) { struct badblock *bblock; unsigned long flags; + unsigned int sz = *sz_ptr; spin_lock_irqsave(&dd->dust_lock, flags); bblock = dust_rb_search(&dd->badblocklist, block); if (bblock != NULL) - DMINFO("%s: block %llu found in badblocklist", __func__, block); + DMEMIT("%s: block %llu found in badblocklist", __func__, block); else - DMINFO("%s: block %llu not found in badblocklist", __func__, block); + DMEMIT("%s: block %llu not found in badblocklist", __func__, block); spin_unlock_irqrestore(&dd->dust_lock, flags); - return 0; + return 1; } static int __dust_map_read(struct dust_device *dd, sector_t 
thisblock) @@ -259,11 +261,13 @@ static bool __dust_clear_badblocks(struct rb_root *tree, return true; } -static int dust_clear_badblocks(struct dust_device *dd) +static int dust_clear_badblocks(struct dust_device *dd, char *result, unsigned int maxlen, + unsigned int *sz_ptr) { unsigned long flags; struct rb_root badblocklist; unsigned long long badblock_count; + unsigned int sz = *sz_ptr; spin_lock_irqsave(&dd->dust_lock, flags); badblocklist = dd->badblocklist; @@ -273,11 +277,11 @@ static int dust_clear_badblocks(struct dust_device *dd) spin_unlock_irqrestore(&dd->dust_lock, flags); if (!__dust_clear_badblocks(&badblocklist, badblock_count)) - DMINFO("%s: no badblocks found", __func__); + DMEMIT("%s: no badblocks found", __func__); else - DMINFO("%s: badblocks cleared", __func__); + DMEMIT("%s: badblocks cleared", __func__); - return 0; + return 1; } /* @@ -383,7 +387,7 @@ static void dust_dtr(struct dm_target *ti) } static int dust_message(struct dm_target *ti, unsigned int argc, char **argv, - char *result_buf, unsigned int maxlen) + char *result, unsigned int maxlen) { struct dust_device *dd = ti->private; sector_t size = i_size_read(dd->dev->bdev->bd_inode) >> SECTOR_SHIFT; @@ -393,6 +397,7 @@ static int dust_message(struct dm_target *ti, unsigned int argc, char **argv, unsigned char wr_fail_cnt; unsigned int tmp_ui; unsigned long flags; + unsigned int sz = 0; char dummy; if (argc == 1) { @@ -410,12 +415,12 @@ static int dust_message(struct dm_target *ti, unsigned int argc, char **argv, r = 0; } else if (!strcasecmp(argv[0], "countbadblocks")) { spin_lock_irqsave(&dd->dust_lock, flags); - DMINFO("countbadblocks: %llu badblock(s) found", + DMEMIT("countbadblocks: %llu badblock(s) found", dd->badblock_count); spin_unlock_irqrestore(&dd->dust_lock, flags); - r = 0; + r = 1; } else if (!strcasecmp(argv[0], "clearbadblocks")) { - r = dust_clear_badblocks(dd); + r = dust_clear_badblocks(dd, result, maxlen, &sz); } else if (!strcasecmp(argv[0], "quiet")) { if (!dd->quiet_mode) dd->quiet_mode = true; @@ -441,7 +446,7 @@ static int dust_message(struct dm_target *ti, unsigned int argc, char **argv, else if (!strcasecmp(argv[0], "removebadblock")) r = dust_remove_block(dd, block); else if (!strcasecmp(argv[0], "queryblock")) - r = dust_query_block(dd, block); + r = dust_query_block(dd, block, result, maxlen, &sz); else invalid_msg = true; From 0c248ea27fc88cf8a3035ba0ed75b210be9abf80 Mon Sep 17 00:00:00 2001 From: yangerkun Date: Fri, 19 Jun 2020 17:12:42 -0400 Subject: [PATCH 10/20] dm dust: add interface to list all badblocks This interface may help anyone who want to know all badblocks without querying for each block. [Bryan: DMEMIT message if no blocks are in the bad block list.] 
Signed-off-by: yangerkun Signed-off-by: Bryan Gurney Signed-off-by: Mike Snitzer --- .../admin-guide/device-mapper/dm-dust.rst | 17 ++++++++++++ drivers/md/dm-dust.c | 27 +++++++++++++++++++ 2 files changed, 44 insertions(+) diff --git a/Documentation/admin-guide/device-mapper/dm-dust.rst b/Documentation/admin-guide/device-mapper/dm-dust.rst index cf8079e368de..e35ec8cd2f88 100644 --- a/Documentation/admin-guide/device-mapper/dm-dust.rst +++ b/Documentation/admin-guide/device-mapper/dm-dust.rst @@ -206,6 +206,22 @@ appear:: dust_clear_badblocks: no badblocks found +Listing the bad block list +-------------------------- + +To list all bad blocks in the bad block list (using an example device +with blocks 1 and 2 in the bad block list), run the following message +command:: + + $ sudo dmsetup message dust1 0 listbadblocks + 1 + 2 + +If there are no bad blocks in the bad block list, the command will +execute with no output:: + + $ sudo dmsetup message dust1 0 listbadblocks + Message commands list --------------------- @@ -224,6 +240,7 @@ Single argument message commands:: countbadblocks clearbadblocks + listbadblocks disable enable quiet diff --git a/drivers/md/dm-dust.c b/drivers/md/dm-dust.c index f1f2dd6a4e84..072ea913cebc 100644 --- a/drivers/md/dm-dust.c +++ b/drivers/md/dm-dust.c @@ -284,6 +284,31 @@ static int dust_clear_badblocks(struct dust_device *dd, char *result, unsigned i return 1; } +static int dust_list_badblocks(struct dust_device *dd, char *result, unsigned int maxlen, + unsigned int *sz_ptr) +{ + unsigned long flags; + struct rb_root badblocklist; + struct rb_node *node; + struct badblock *bblk; + unsigned int sz = *sz_ptr; + unsigned long long num = 0; + + spin_lock_irqsave(&dd->dust_lock, flags); + badblocklist = dd->badblocklist; + for (node = rb_first(&badblocklist); node; node = rb_next(node)) { + bblk = rb_entry(node, struct badblock, node); + DMEMIT("%llu\n", bblk->bb); + num++; + } + + spin_unlock_irqrestore(&dd->dust_lock, flags); + if (!num) + DMEMIT("No blocks in badblocklist"); + + return 1; +} + /* * Target parameters: * @@ -427,6 +452,8 @@ static int dust_message(struct dm_target *ti, unsigned int argc, char **argv, else dd->quiet_mode = false; r = 0; + } else if (!strcasecmp(argv[0], "listbadblocks")) { + r = dust_list_badblocks(dd, result, maxlen, &sz); } else { invalid_msg = true; } From e766668c6cd49d741cfb49eaeb38998ba34d27bc Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 19 Jun 2020 16:42:14 +0800 Subject: [PATCH 11/20] dm rq: don't call blk_mq_queue_stopped() in dm_stop_queue() dm_stop_queue() only uses blk_mq_quiesce_queue() so it doesn't formally stop the blk-mq queue; therefore there is no point making the blk_mq_queue_stopped() check -- it will never be stopped. In addition, even though dm_stop_queue() actually tries to quiesce hw queues via blk_mq_quiesce_queue(), checking with blk_queue_quiesced() to avoid unnecessary queue quiesce isn't reliable because: the QUEUE_FLAG_QUIESCED flag is set before synchronize_rcu() and dm_stop_queue() may be called when synchronize_rcu() from another blk_mq_quiesce_queue() is in-progress. 
Fixes: 7b17c2f7292ba ("dm: Fix a race condition related to stopping and starting queues") Signed-off-by: Ming Lei Signed-off-by: Mike Snitzer --- drivers/md/dm-rq.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index 85e0daabad49..20745e2e34b9 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -70,9 +70,6 @@ void dm_start_queue(struct request_queue *q) void dm_stop_queue(struct request_queue *q) { - if (blk_mq_queue_stopped(q)) - return; - blk_mq_quiesce_queue(q); } From 70704c33db841d847b16d8ae7e29ae3b8a479e75 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 3 Jul 2020 10:26:46 -0400 Subject: [PATCH 12/20] dm bufio: do buffer cleanup from a workqueue Until now, DM bufio's waiting for IO from reclaim context in its shrinker has caused kswapd to block; which results in systemic IO stalls and even deadlock, e.g.: https://www.redhat.com/archives/dm-devel/2020-March/msg00025.html Here is Dave Chinner's problem description that motivated this fix, from: https://lore.kernel.org/linux-fsdevel/20190809215733.GZ7777@dread.disaster.area/ "Waiting for IO in kswapd reclaim context is considered harmful - kswapd context shrinker reclaim should be as non-blocking as possible, and any back-off to wait for IO to complete should be done by the high level reclaim core once it's completed an entire reclaim scan cycle of everything.... What follows from that, and is pertinent in this situation, is that if you don't block kswapd, then other reclaim contexts are not going to get stuck waiting for it regardless of the reclaim context they use." Continued elsewhere: "The only way to fix this problem once and for all is to stop using the shrinker as a mechanism to issue and wait on IO. If you need background writeback of dirty buffers, do it from a WQ_MEM_RECLAIM workqueue that isn't directly in the memory reclaim path and so can issue writeback and block safely from a GFP_KERNEL context. Kick the workqueue from the shrinker context, but get rid of the IO submission and waiting from the shrinker and all the GFP_NOFS memory reclaim recursion problems go away." As such, this commit moves buffer cleanup to a workqueue. 
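The shape of the change — the non-blocking callback only records how much to reclaim and kicks a worker, which then does the blocking cleanup — can be sketched in user space; pthreads stand in for the WQ_MEM_RECLAIM workqueue and all names are illustrative:

  #include <pthread.h>
  #include <stdatomic.h>
  #include <stdio.h>
  #include <unistd.h>

  static atomic_long need_shrink;
  static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
  static pthread_cond_t kick = PTHREAD_COND_INITIALIZER;

  /* "Shrinker" context: must not wait for I/O.  Record the work and wake
   * the worker, the way dm_bufio_shrink_scan() now calls queue_work(). */
  static long shrink_scan(long nr_to_scan)
  {
          atomic_fetch_add(&need_shrink, nr_to_scan);
          pthread_mutex_lock(&lock);
          pthread_cond_signal(&kick);
          pthread_mutex_unlock(&lock);
          return nr_to_scan;
  }

  /* Worker: free to block, e.g. waiting for buffer write-back. */
  static void *shrink_work(void *unused)
  {
          pthread_mutex_lock(&lock);
          while (atomic_load(&need_shrink) == 0)
                  pthread_cond_wait(&kick, &lock);
          pthread_mutex_unlock(&lock);

          while (atomic_load(&need_shrink) > 0) {
                  usleep(1000);           /* stand-in for blocking eviction */
                  atomic_fetch_sub(&need_shrink, 1);
          }
          return NULL;
  }

  int main(void)
  {
          pthread_t worker;

          pthread_create(&worker, NULL, shrink_work, NULL);
          shrink_scan(8);
          pthread_join(worker, NULL);
          printf("pending: %ld\n", atomic_load(&need_shrink));
          return 0;
  }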
Suggested-by: Dave Chinner Reported-by: Tahsin Erdogan Signed-off-by: Mikulas Patocka Tested-by: Gabriel Krisman Bertazi Signed-off-by: Mike Snitzer --- drivers/md/dm-bufio.c | 60 +++++++++++++++++++++++++++++-------------- 1 file changed, 41 insertions(+), 19 deletions(-) diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 6d1565021d74..9c1a86bde658 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -108,7 +108,10 @@ struct dm_bufio_client { int async_write_error; struct list_head client_list; + struct shrinker shrinker; + struct work_struct shrink_work; + atomic_long_t need_shrink; }; /* @@ -1634,8 +1637,7 @@ static unsigned long get_retain_buffers(struct dm_bufio_client *c) return retain_bytes; } -static unsigned long __scan(struct dm_bufio_client *c, unsigned long nr_to_scan, - gfp_t gfp_mask) +static void __scan(struct dm_bufio_client *c) { int l; struct dm_buffer *b, *tmp; @@ -1646,42 +1648,58 @@ static unsigned long __scan(struct dm_bufio_client *c, unsigned long nr_to_scan, for (l = 0; l < LIST_SIZE; l++) { list_for_each_entry_safe_reverse(b, tmp, &c->lru[l], lru_list) { - if (__try_evict_buffer(b, gfp_mask)) + if (count - freed <= retain_target) + atomic_long_set(&c->need_shrink, 0); + if (!atomic_long_read(&c->need_shrink)) + return; + if (__try_evict_buffer(b, GFP_KERNEL)) { + atomic_long_dec(&c->need_shrink); freed++; - if (!--nr_to_scan || ((count - freed) <= retain_target)) - return freed; + } cond_resched(); } } - return freed; } -static unsigned long -dm_bufio_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) +static void shrink_work(struct work_struct *w) +{ + struct dm_bufio_client *c = container_of(w, struct dm_bufio_client, shrink_work); + + dm_bufio_lock(c); + __scan(c); + dm_bufio_unlock(c); +} + +static unsigned long dm_bufio_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) { struct dm_bufio_client *c; - unsigned long freed; c = container_of(shrink, struct dm_bufio_client, shrinker); - if (sc->gfp_mask & __GFP_FS) - dm_bufio_lock(c); - else if (!dm_bufio_trylock(c)) - return SHRINK_STOP; + atomic_long_add(sc->nr_to_scan, &c->need_shrink); + queue_work(dm_bufio_wq, &c->shrink_work); - freed = __scan(c, sc->nr_to_scan, sc->gfp_mask); - dm_bufio_unlock(c); - return freed; + return sc->nr_to_scan; } -static unsigned long -dm_bufio_shrink_count(struct shrinker *shrink, struct shrink_control *sc) +static unsigned long dm_bufio_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { struct dm_bufio_client *c = container_of(shrink, struct dm_bufio_client, shrinker); unsigned long count = READ_ONCE(c->n_buffers[LIST_CLEAN]) + READ_ONCE(c->n_buffers[LIST_DIRTY]); unsigned long retain_target = get_retain_buffers(c); + unsigned long queued_for_cleanup = atomic_long_read(&c->need_shrink); - return (count < retain_target) ? 
0 : (count - retain_target); + if (unlikely(count < retain_target)) + count = 0; + else + count -= retain_target; + + if (unlikely(count < queued_for_cleanup)) + count = 0; + else + count -= queued_for_cleanup; + + return count; } /* @@ -1772,6 +1790,9 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign __free_buffer_wake(b); } + INIT_WORK(&c->shrink_work, shrink_work); + atomic_long_set(&c->need_shrink, 0); + c->shrinker.count_objects = dm_bufio_shrink_count; c->shrinker.scan_objects = dm_bufio_shrink_scan; c->shrinker.seeks = 1; @@ -1817,6 +1838,7 @@ void dm_bufio_client_destroy(struct dm_bufio_client *c) drop_buffers(c); unregister_shrinker(&c->shrinker); + flush_work(&c->shrink_work); mutex_lock(&dm_bufio_clients_lock); From 39d42fa96ba1b7d2544db3f8ed5da8fb0d5cb877 Mon Sep 17 00:00:00 2001 From: Ignat Korchagin Date: Mon, 6 Jul 2020 18:37:31 +0100 Subject: [PATCH 13/20] dm crypt: add flags to optionally bypass kcryptd workqueues This is a follow up to [1] that detailed latency problems associated with dm-crypt's use of workqueues when processing IO. Current dm-crypt implementation creates a significant IO performance overhead (at least on small IO block sizes) for both latency and throughput. We suspect offloading IO request processing into workqueues and async threads is more harmful these days with the modern fast storage. I also did some digging into the dm-crypt git history and much of this async processing is not needed anymore, because the reasons it was added are mostly gone from the kernel. More details can be found in [2] (see "Git archeology" section). This change adds DM_CRYPT_NO_READ_WORKQUEUE and DM_CRYPT_NO_WRITE_WORKQUEUE flags for read and write BIOs, which direct dm-crypt to not offload crypto operations into kcryptd workqueues. In addition, writes are not buffered to be sorted in the dm-crypt red-black tree, but dispatched immediately. For cases, where crypto operations cannot happen (hard interrupt context, for example the read path of some NVME drivers), we offload the work to a tasklet rather than a workqueue. These flags only ensure no async BIO processing in the dm-crypt module. It is worth noting that some Crypto API implementations may offload encryption into their own workqueues, which are independent of the dm-crypt and its configuration. However upon enabling these new flags dm-crypt will instruct Crypto API not to backlog crypto requests. To give an idea of the performance gains for certain workloads, consider the script, and results when tested against various devices, detailed here: https://www.redhat.com/archives/dm-devel/2020-July/msg00138.html [1]: https://www.spinics.net/lists/dm-crypt/msg07516.html [2]: https://blog.cloudflare.com/speeding-up-linux-disk-encryption/ Signed-off-by: Ignat Korchagin Reviewed-by: Damien Le Moal Reviewed-by: Bob Liu Signed-off-by: Mike Snitzer --- drivers/md/dm-crypt.c | 50 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 42 insertions(+), 8 deletions(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 000ddfab5ba0..7536ecb2c95d 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -69,6 +69,7 @@ struct dm_crypt_io { u8 *integrity_metadata; bool integrity_metadata_from_pool; struct work_struct work; + struct tasklet_struct tasklet; struct convert_context ctx; @@ -127,7 +128,8 @@ struct iv_elephant_private { * and encrypts / decrypts at the same time. 
*/ enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID, - DM_CRYPT_SAME_CPU, DM_CRYPT_NO_OFFLOAD }; + DM_CRYPT_SAME_CPU, DM_CRYPT_NO_OFFLOAD, + DM_CRYPT_NO_READ_WORKQUEUE, DM_CRYPT_NO_WRITE_WORKQUEUE }; enum cipher_flags { CRYPT_MODE_INTEGRITY_AEAD, /* Use authenticated mode for cihper */ @@ -1523,7 +1525,7 @@ static void crypt_free_req(struct crypt_config *cc, void *req, struct bio *base_ * Encrypt / decrypt data from one bio to another one (can be the same one) */ static blk_status_t crypt_convert(struct crypt_config *cc, - struct convert_context *ctx) + struct convert_context *ctx, bool atomic) { unsigned int tag_offset = 0; unsigned int sector_step = cc->sector_size >> SECTOR_SHIFT; @@ -1566,7 +1568,8 @@ static blk_status_t crypt_convert(struct crypt_config *cc, atomic_dec(&ctx->cc_pending); ctx->cc_sector += sector_step; tag_offset++; - cond_resched(); + if (!atomic) + cond_resched(); continue; /* * There was a data integrity error. @@ -1892,7 +1895,8 @@ static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async) clone->bi_iter.bi_sector = cc->start + io->sector; - if (likely(!async) && test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags)) { + if ((likely(!async) && test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags)) || + test_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags)) { generic_make_request(clone); return; } @@ -1941,7 +1945,8 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io) sector += bio_sectors(clone); crypt_inc_pending(io); - r = crypt_convert(cc, &io->ctx); + r = crypt_convert(cc, &io->ctx, + test_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags)); if (r) io->error = r; crypt_finished = atomic_dec_and_test(&io->ctx.cc_pending); @@ -1971,7 +1976,8 @@ static void kcryptd_crypt_read_convert(struct dm_crypt_io *io) crypt_convert_init(cc, &io->ctx, io->base_bio, io->base_bio, io->sector); - r = crypt_convert(cc, &io->ctx); + r = crypt_convert(cc, &io->ctx, + test_bit(DM_CRYPT_NO_READ_WORKQUEUE, &cc->flags)); if (r) io->error = r; @@ -2031,10 +2037,28 @@ static void kcryptd_crypt(struct work_struct *work) kcryptd_crypt_write_convert(io); } +static void kcryptd_crypt_tasklet(unsigned long work) +{ + kcryptd_crypt((struct work_struct *)work); +} + static void kcryptd_queue_crypt(struct dm_crypt_io *io) { struct crypt_config *cc = io->cc; + if ((bio_data_dir(io->base_bio) == READ && test_bit(DM_CRYPT_NO_READ_WORKQUEUE, &cc->flags)) || + (bio_data_dir(io->base_bio) == WRITE && test_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags))) { + if (in_irq()) { + /* Crypto API's "skcipher_walk_first() refuses to work in hard IRQ context */ + tasklet_init(&io->tasklet, kcryptd_crypt_tasklet, (unsigned long)&io->work); + tasklet_schedule(&io->tasklet); + return; + } + + kcryptd_crypt(&io->work); + return; + } + INIT_WORK(&io->work, kcryptd_crypt); queue_work(cc->crypt_queue, &io->work); } @@ -2838,7 +2862,7 @@ static int crypt_ctr_optional(struct dm_target *ti, unsigned int argc, char **ar struct crypt_config *cc = ti->private; struct dm_arg_set as; static const struct dm_arg _args[] = { - {0, 6, "Invalid number of feature args"}, + {0, 8, "Invalid number of feature args"}, }; unsigned int opt_params, val; const char *opt_string, *sval; @@ -2868,6 +2892,10 @@ static int crypt_ctr_optional(struct dm_target *ti, unsigned int argc, char **ar else if (!strcasecmp(opt_string, "submit_from_crypt_cpus")) set_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags); + else if (!strcasecmp(opt_string, "no_read_workqueue")) + set_bit(DM_CRYPT_NO_READ_WORKQUEUE, &cc->flags); + else if (!strcasecmp(opt_string, 
"no_write_workqueue")) + set_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags); else if (sscanf(opt_string, "integrity:%u:", &val) == 1) { if (val == 0 || val > MAX_TAG_SIZE) { ti->error = "Invalid integrity arguments"; @@ -3196,6 +3224,8 @@ static void crypt_status(struct dm_target *ti, status_type_t type, num_feature_args += !!ti->num_discard_bios; num_feature_args += test_bit(DM_CRYPT_SAME_CPU, &cc->flags); num_feature_args += test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags); + num_feature_args += test_bit(DM_CRYPT_NO_READ_WORKQUEUE, &cc->flags); + num_feature_args += test_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags); num_feature_args += cc->sector_size != (1 << SECTOR_SHIFT); num_feature_args += test_bit(CRYPT_IV_LARGE_SECTORS, &cc->cipher_flags); if (cc->on_disk_tag_size) @@ -3208,6 +3238,10 @@ static void crypt_status(struct dm_target *ti, status_type_t type, DMEMIT(" same_cpu_crypt"); if (test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags)) DMEMIT(" submit_from_crypt_cpus"); + if (test_bit(DM_CRYPT_NO_READ_WORKQUEUE, &cc->flags)) + DMEMIT(" no_read_workqueue"); + if (test_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags)) + DMEMIT(" no_write_workqueue"); if (cc->on_disk_tag_size) DMEMIT(" integrity:%u:%s", cc->on_disk_tag_size, cc->cipher_auth); if (cc->sector_size != (1 << SECTOR_SHIFT)) @@ -3320,7 +3354,7 @@ static void crypt_io_hints(struct dm_target *ti, struct queue_limits *limits) static struct target_type crypt_target = { .name = "crypt", - .version = {1, 21, 0}, + .version = {1, 22, 0}, .module = THIS_MODULE, .ctr = crypt_ctr, .dtr = crypt_dtr, From 8e225f04d2dd6c1ca1eaecd5b04eaa90284df507 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 8 Jul 2020 18:28:08 +0900 Subject: [PATCH 14/20] dm crypt: Enable zoned block device support Enable support for zoned block devices. This is done by: 1) implementing the target report_zones method. 2) adding the DM_TARGET_ZONED_HM flag to the target features. 3) setting DM_CRYPT_NO_WRITE_WORKQUEUE flag to avoid IO processing via workqueue. 4) Introducing inline write encryption completion to preserve write ordering. The last point is implemented by introducing the internal flag DM_CRYPT_WRITE_INLINE. When set, kcryptd_crypt_write_convert() always waits inline for the completion of a write request encryption if the request is not already completed once crypt_convert() returns. Completion of write request encryption is signaled using the restart completion by kcryptd_async_done(). This mechanism allows using ciphers that have an asynchronous implementation, isolating dm-crypt from any potential request completion reordering for these ciphers. 
Signed-off-by: Damien Le Moal Signed-off-by: Mike Snitzer --- drivers/md/dm-crypt.c | 83 +++++++++++++++++++++++++++++++++++++++---- 1 file changed, 76 insertions(+), 7 deletions(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 7536ecb2c95d..bad05c5ed3b5 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -129,7 +129,8 @@ struct iv_elephant_private { */ enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID, DM_CRYPT_SAME_CPU, DM_CRYPT_NO_OFFLOAD, - DM_CRYPT_NO_READ_WORKQUEUE, DM_CRYPT_NO_WRITE_WORKQUEUE }; + DM_CRYPT_NO_READ_WORKQUEUE, DM_CRYPT_NO_WRITE_WORKQUEUE, + DM_CRYPT_WRITE_INLINE }; enum cipher_flags { CRYPT_MODE_INTEGRITY_AEAD, /* Use authenticated mode for cihper */ @@ -1919,9 +1920,32 @@ static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async) spin_unlock_irqrestore(&cc->write_thread_lock, flags); } +static bool kcryptd_crypt_write_inline(struct crypt_config *cc, + struct convert_context *ctx) + +{ + if (!test_bit(DM_CRYPT_WRITE_INLINE, &cc->flags)) + return false; + + /* + * Note: zone append writes (REQ_OP_ZONE_APPEND) do not have ordering + * constraints so they do not need to be issued inline by + * kcryptd_crypt_write_convert(). + */ + switch (bio_op(ctx->bio_in)) { + case REQ_OP_WRITE: + case REQ_OP_WRITE_SAME: + case REQ_OP_WRITE_ZEROES: + return true; + default: + return false; + } +} + static void kcryptd_crypt_write_convert(struct dm_crypt_io *io) { struct crypt_config *cc = io->cc; + struct convert_context *ctx = &io->ctx; struct bio *clone; int crypt_finished; sector_t sector = io->sector; @@ -1931,7 +1955,7 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io) * Prevent io from disappearing until this function completes. */ crypt_inc_pending(io); - crypt_convert_init(cc, &io->ctx, NULL, io->base_bio, sector); + crypt_convert_init(cc, ctx, NULL, io->base_bio, sector); clone = crypt_alloc_buffer(io, io->base_bio->bi_iter.bi_size); if (unlikely(!clone)) { @@ -1945,11 +1969,16 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io) sector += bio_sectors(clone); crypt_inc_pending(io); - r = crypt_convert(cc, &io->ctx, + r = crypt_convert(cc, ctx, test_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags)); if (r) io->error = r; - crypt_finished = atomic_dec_and_test(&io->ctx.cc_pending); + crypt_finished = atomic_dec_and_test(&ctx->cc_pending); + if (!crypt_finished && kcryptd_crypt_write_inline(cc, ctx)) { + /* Wait for completion signaled by kcryptd_async_done() */ + wait_for_completion(&ctx->restart); + crypt_finished = 1; + } /* Encryption was already finished, submit io now */ if (crypt_finished) { @@ -2021,10 +2050,21 @@ static void kcryptd_async_done(struct crypto_async_request *async_req, if (!atomic_dec_and_test(&ctx->cc_pending)) return; - if (bio_data_dir(io->base_bio) == READ) + /* + * The request is fully completed: for inline writes, let + * kcryptd_crypt_write_convert() do the IO submission. 
+ */ + if (bio_data_dir(io->base_bio) == READ) { kcryptd_crypt_read_done(io); - else - kcryptd_crypt_write_io_submit(io, 1); + return; + } + + if (kcryptd_crypt_write_inline(cc, ctx)) { + complete(&ctx->restart); + return; + } + + kcryptd_crypt_write_io_submit(io, 1); } static void kcryptd_crypt(struct work_struct *work) @@ -2936,6 +2976,21 @@ static int crypt_ctr_optional(struct dm_target *ti, unsigned int argc, char **ar return 0; } +#ifdef CONFIG_BLK_DEV_ZONED + +static int crypt_report_zones(struct dm_target *ti, + struct dm_report_zones_args *args, unsigned int nr_zones) +{ + struct crypt_config *cc = ti->private; + sector_t sector = cc->start + dm_target_offset(ti, args->next_sector); + + args->start = cc->start; + return blkdev_report_zones(cc->dev->bdev, sector, nr_zones, + dm_report_zones_cb, args); +} + +#endif + /* * Construct an encryption mapping: * [|:::] @@ -3069,6 +3124,16 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) } cc->start = tmpll; + /* + * For zoned block devices, we need to preserve the issuer write + * ordering. To do so, disable write workqueues and force inline + * encryption completion. + */ + if (bdev_is_zoned(cc->dev->bdev)) { + set_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags); + set_bit(DM_CRYPT_WRITE_INLINE, &cc->flags); + } + if (crypt_integrity_aead(cc) || cc->integrity_iv_size) { ret = crypt_integrity_ctr(cc, ti); if (ret) @@ -3358,6 +3423,10 @@ static struct target_type crypt_target = { .module = THIS_MODULE, .ctr = crypt_ctr, .dtr = crypt_dtr, +#ifdef CONFIG_BLK_DEV_ZONED + .features = DM_TARGET_ZONED_HM, + .report_zones = crypt_report_zones, +#endif .map = crypt_map, .status = crypt_status, .postsuspend = crypt_postsuspend, From a84c4308333a2cbd54593649a8b144df95c68227 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 16 Jul 2020 13:38:12 +0900 Subject: [PATCH 15/20] dm verity: Fix compilation warning For the case !CONFIG_DM_VERITY_VERIFY_ROOTHASH_SIG, declare the functions verity_verify_root_hash(), verity_verify_is_sig_opt_arg(), verity_verify_sig_parse_opt_args() and verity_verify_sig_opts_cleanup() as inline to avoid a "no previous prototype for xxx" compilation warning when compiling with W=1. 
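The underlying issue is the usual one for config-off stubs in a header: a non-static, non-inline definition there has no previous prototype and gets emitted into every including unit. A minimal stand-alone illustration of the pattern the patch applies (hypothetical config toggle, real function signature from the header):

  #include <stddef.h>
  #include <stdio.h>

  /* #define CONFIG_DM_VERITY_VERIFY_ROOTHASH_SIG */

  #ifdef CONFIG_DM_VERITY_VERIFY_ROOTHASH_SIG
  int verity_verify_root_hash(const void *data, size_t data_len,
                              const void *sig_data, size_t sig_len);
  #else
  /* Config-off stub: static inline avoids the "no previous prototype"
   * warning and duplicate definitions across translation units. */
  static inline int verity_verify_root_hash(const void *data, size_t data_len,
                                            const void *sig_data, size_t sig_len)
  {
          return 0;
  }
  #endif

  int main(void)
  {
          printf("%d\n", verity_verify_root_hash(NULL, 0, NULL, 0));
          return 0;
  }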
Signed-off-by: Damien Le Moal Signed-off-by: Mike Snitzer --- drivers/md/dm-verity-verify-sig.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/md/dm-verity-verify-sig.h b/drivers/md/dm-verity-verify-sig.h index 19b1547aa741..3987c7141f79 100644 --- a/drivers/md/dm-verity-verify-sig.h +++ b/drivers/md/dm-verity-verify-sig.h @@ -34,25 +34,25 @@ void verity_verify_sig_opts_cleanup(struct dm_verity_sig_opts *sig_opts); #define DM_VERITY_ROOT_HASH_VERIFICATION_OPTS 0 -int verity_verify_root_hash(const void *data, size_t data_len, - const void *sig_data, size_t sig_len) +static inline int verity_verify_root_hash(const void *data, size_t data_len, + const void *sig_data, size_t sig_len) { return 0; } -bool verity_verify_is_sig_opt_arg(const char *arg_name) +static inline bool verity_verify_is_sig_opt_arg(const char *arg_name) { return false; } -int verity_verify_sig_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v, - struct dm_verity_sig_opts *sig_opts, - unsigned int *argc, const char *arg_name) +static inline int verity_verify_sig_parse_opt_args(struct dm_arg_set *as, + struct dm_verity *v, struct dm_verity_sig_opts *sig_opts, + unsigned int *argc, const char *arg_name) { return -EINVAL; } -void verity_verify_sig_opts_cleanup(struct dm_verity_sig_opts *sig_opts) +static inline void verity_verify_sig_opts_cleanup(struct dm_verity_sig_opts *sig_opts) { } From 04dc5330e5deebde2c88f8422c446e8212090ba3 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 16 Jul 2020 13:38:13 +0900 Subject: [PATCH 16/20] dm raid: Remove empty if statement In super_init_validation(), remove a body-less if statement testing only variables to avoid a compilation warning when compiling with W=1. Signed-off-by: Damien Le Moal Signed-off-by: Mike Snitzer --- drivers/md/dm-raid.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 10e8b2fe787b..08023c50aaa0 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -2345,8 +2345,6 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev) if (new_devs == rs->raid_disks || !rebuilds) { /* Replace a broken device */ - if (new_devs == 1 && !rs->delta_disks) - ; if (new_devs == rs->raid_disks) { DMINFO("Superblocks created for new raid set"); set_bit(MD_ARRAY_FIRST_USE, &mddev->flags); From 1aeb6e7cd118b90f1a227c79b26100af6a15960e Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 16 Jul 2020 13:38:14 +0900 Subject: [PATCH 17/20] dm ioctl: Fix compilation warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In retrieve_status(), when copying the target type name in the target_type string field of struct dm_target_spec, copy at most DM_MAX_TYPE_NAME - 1 character to avoid the compilation warning: warning: ‘__builtin_strncpy’ specified bound 16 equals destination size [-Wstringop-truncation] when compiling with W-1. 
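The warning points at a real hazard: with a bound equal to the destination size, strncpy() leaves the destination unterminated whenever the source fills it. A small stand-alone illustration (the 16-byte size mirrors DM_MAX_TYPE_NAME; everything else is illustrative):

  #include <stdio.h>
  #include <string.h>

  #define DM_MAX_TYPE_NAME 16

  int main(void)
  {
          char dst[DM_MAX_TYPE_NAME] = "";
          const char *src = "sixteen-chars-xx";  /* 16 chars, no room for NUL */

          /* Bound == sizeof(dst): no terminating NUL is written when
           * strlen(src) >= sizeof(dst), so printing dst would overrun. */
          strncpy(dst, src, sizeof(dst));

          /* Bound == sizeof(dst) - 1: the last byte stays NUL. */
          memset(dst, 0, sizeof(dst));
          strncpy(dst, src, sizeof(dst) - 1);
          printf("%s\n", dst);    /* truncated but NUL-terminated */
          return 0;
  }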
Signed-off-by: Damien Le Moal Signed-off-by: Mike Snitzer --- drivers/md/dm-ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 489935d5f22d..5792878dbff6 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -1168,7 +1168,7 @@ static void retrieve_status(struct dm_table *table, spec->sector_start = ti->begin; spec->length = ti->len; strncpy(spec->target_type, ti->type->name, - sizeof(spec->target_type)); + sizeof(spec->target_type) - 1); outptr += sizeof(struct dm_target_spec); remaining = len - (outptr - outbuf); From 90e6bf0659f11f3d8ffa7ae1d7a4fa2af015747a Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 16 Jul 2020 13:38:15 +0900 Subject: [PATCH 18/20] dm init: Set file local variable static Declare dm_allowed_targets as static to avoid the warning: drivers/md/dm-init.c:39:12: warning: symbol 'dm_allowed_targets' was not declared. Should it be static? when compiling with C=1. Signed-off-by: Damien Le Moal Signed-off-by: Mike Snitzer --- drivers/md/dm-init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-init.c b/drivers/md/dm-init.c index b869316d3722..b0c45c6ebe0b 100644 --- a/drivers/md/dm-init.c +++ b/drivers/md/dm-init.c @@ -36,7 +36,7 @@ struct dm_device { struct list_head list; }; -const char * const dm_allowed_targets[] __initconst = { +static const char * const dm_allowed_targets[] __initconst = { "crypt", "delay", "linear", From 4cb6f22612511ff2aba4c33fb0f281cae7c23772 Mon Sep 17 00:00:00 2001 From: John Dorminy Date: Fri, 31 Jul 2020 18:46:45 -0400 Subject: [PATCH 19/20] dm ebs: Fix incorrect checking for REQ_OP_FLUSH REQ_OP_FLUSH was being treated as a flag, but the operation part of bio->bi_opf must be treated as a whole. Change to accessing the operation part via bio_op(bio) and checking for equality. Signed-off-by: John Dorminy Acked-by: Heinz Mauelshagen Fixes: d3c7b35c20d60 ("dm: add emulated block size target") Cc: stable@vger.kernel.org Signed-off-by: Mike Snitzer --- drivers/md/dm-ebs-target.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-ebs-target.c b/drivers/md/dm-ebs-target.c index 44451276f128..cb85610527c2 100644 --- a/drivers/md/dm-ebs-target.c +++ b/drivers/md/dm-ebs-target.c @@ -363,7 +363,7 @@ static int ebs_map(struct dm_target *ti, struct bio *bio) bio_set_dev(bio, ec->dev->bdev); bio->bi_iter.bi_sector = ec->start + dm_target_offset(ti, bio->bi_iter.bi_sector); - if (unlikely(bio->bi_opf & REQ_OP_FLUSH)) + if (unlikely(bio_op(bio) == REQ_OP_FLUSH)) return DM_MAPIO_REMAPPED; /* * Only queue for bufio processing in case of partial or overlapping buffers From a9cb9f4148ef6bb8fabbdaa85c42b2171fbd5a0d Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Tue, 4 Aug 2020 18:25:01 +0900 Subject: [PATCH 20/20] dm: don't call report zones for more than the user requested Don't call report zones for more zones than the user actually requested, otherwise this can lead to out-of-bounds accesses in the callback functions. Such a situation can happen if the target's ->report_zones() callback function returns 0 because we've reached the end of the target and then restart the report zones on the second target. We're again calling into ->report_zones() and ultimately into the user supplied callback function but when we're not subtracting the number of zones already processed this may lead to out-of-bounds accesses in the user callbacks. 
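A stand-alone sketch of the arithmetic (simplified types and names, not the dm code): the caller allocates room for nr_zones entries, so each subsequent target must only be asked for the zones still missing.

  #include <stdio.h>

  #define NR_REQUESTED 4

  struct args {
          unsigned int zone_idx;
          int *out;               /* user-allocated, NR_REQUESTED entries */
  };

  static void user_cb(struct args *a, int zone)
  {
          a->out[a->zone_idx++] = zone;
  }

  /* One "target" holding 'avail' zones starting at 'base'; reports at
   * most 'nr' of them through the user callback. */
  static void target_report_zones(struct args *a, int base, unsigned int avail,
                                  unsigned int nr)
  {
          for (unsigned int i = 0; i < nr && i < avail; i++)
                  user_cb(a, base + i);
  }

  int main(void)
  {
          int zones[NR_REQUESTED];
          struct args a = { .zone_idx = 0, .out = zones };

          target_report_zones(&a, 0, 3, NR_REQUESTED);
          /* Asking the second target for NR_REQUESTED again could run the
           * callback past zones[]; ask only for the remainder. */
          target_report_zones(&a, 3, 3, NR_REQUESTED - a.zone_idx);

          printf("reported %u zones\n", a.zone_idx);
          return 0;
  }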
Signed-off-by: Johannes Thumshirn Reviewed-by: Damien Le Moal Fixes: d41003513e61 ("block: rework zone reporting") Cc: stable@vger.kernel.org # v5.5+ Signed-off-by: Mike Snitzer --- drivers/md/dm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 52449afd58eb..937a4194442f 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -503,7 +503,8 @@ static int dm_blk_report_zones(struct gendisk *disk, sector_t sector, } args.tgt = tgt; - ret = tgt->type->report_zones(tgt, &args, nr_zones); + ret = tgt->type->report_zones(tgt, &args, + nr_zones - args.zone_idx); if (ret < 0) goto out; } while (args.zone_idx < nr_zones &&