From 16961b042db8cc5cf75d782b4255193ad56e1d4f Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Tue, 17 Dec 2013 13:19:11 -0500 Subject: [PATCH 01/35] dm thin: initialize dm_thin_new_mapping returned by get_next_mapping As additional members are added to the dm_thin_new_mapping structure, care should be taken to make sure they get initialized before use. Signed-off-by: Mike Snitzer Acked-by: Joe Thornber Cc: stable@vger.kernel.org --- drivers/md/dm-thin.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index ee29037ffc2e..da65febdb6c4 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -751,13 +751,17 @@ static int ensure_next_mapping(struct pool *pool) static struct dm_thin_new_mapping *get_next_mapping(struct pool *pool) { - struct dm_thin_new_mapping *r = pool->next_mapping; + struct dm_thin_new_mapping *m = pool->next_mapping; BUG_ON(!pool->next_mapping); + memset(m, 0, sizeof(struct dm_thin_new_mapping)); + INIT_LIST_HEAD(&m->list); + m->bio = NULL; + pool->next_mapping = NULL; - return r; + return m; } static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, @@ -769,15 +773,10 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, struct pool *pool = tc->pool; struct dm_thin_new_mapping *m = get_next_mapping(pool); - INIT_LIST_HEAD(&m->list); - m->quiesced = 0; - m->prepared = 0; m->tc = tc; m->virt_block = virt_block; m->data_block = data_dest; m->cell = cell; - m->err = 0; - m->bio = NULL; if (!dm_deferred_set_add_work(pool->shared_read_ds, &m->list)) m->quiesced = 1; @@ -840,15 +839,12 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block, struct pool *pool = tc->pool; struct dm_thin_new_mapping *m = get_next_mapping(pool); - INIT_LIST_HEAD(&m->list); m->quiesced = 1; m->prepared = 0; m->tc = tc; m->virt_block = virt_block; m->data_block = data_block; m->cell = cell; - m->err = 0; - m->bio = NULL; /* * If the whole block of data is being overwritten or we are not * zeroing pre-existing data, we can issue the bio immediately. @@ -1045,7 +1041,6 @@ static void process_discard(struct thin_c *tc, struct bio *bio) m->data_block = lookup_result.block; m->cell = cell; m->cell2 = cell2; - m->err = 0; m->bio = bio; if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list)) { From 19fa1a6756ed9e92daa9537c03b47d6b55cc2316 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Tue, 17 Dec 2013 12:09:40 -0500 Subject: [PATCH 02/35] dm thin: fix discard support to a previously shared block If a snapshot is created and later deleted, the origin dm_thin_device's snapshotted_time will have been updated to reflect the snapshot's creation time. The 'shared' flag in the dm_thin_lookup_result struct returned from dm_thin_find_block() is an approximation based on snapshotted_time -- this is done to avoid O(n), or worse, time complexity. In this case, the shared flag would be true. But because the 'shared' flag reflects an approximation, a block can be incorrectly assumed to be shared (e.g. a false positive for 'shared' because the snapshot no longer exists). This could result in discards issued to a thin device not being passed down to the pool's underlying data device. To fix this we double check that a thin block is really still in-use after a mapping is removed, using dm_pool_block_is_used(). If the reference count for a block is now zero, the discard is allowed to be passed down.
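In outline, the passdown decision becomes the following (a simplified sketch with abridged variable names; the actual code is in the process_prepared_discard_passdown() hunk of the dm-thin.c diff below):

	bool still_used = false;

	/*
	 * The mapping has already been removed from the thin device's btree,
	 * so a data-block reference count of zero means the block really is
	 * unshared and the discard may be passed down.
	 */
	if (dm_pool_block_is_used(pool->pmd, data_block, &still_used) || still_used)
		bio_endio(bio, 0);			/* possibly shared (or lookup failed): complete without passdown */
	else
		remap_and_issue(tc, bio, data_block);	/* pass the discard down to the data device */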
Also add a 'definitely_not_shared' member to the dm_thin_new_mapping structure -- reflects that the 'shared' flag in the response from dm_thin_find_block() can only be held as definitive if false is returned. Resolves: https://bugzilla.redhat.com/show_bug.cgi?id=1043527 Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer Cc: stable@vger.kernel.org --- drivers/md/dm-thin-metadata.c | 20 ++++++++++++++++++++ drivers/md/dm-thin-metadata.h | 2 ++ drivers/md/dm-thin.c | 14 ++++++++++++-- 3 files changed, 34 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index 8a30ad54bd46..7da347665552 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c @@ -1349,6 +1349,12 @@ dm_thin_id dm_thin_dev_id(struct dm_thin_device *td) return td->id; } +/* + * Check whether @time (of block creation) is older than @td's last snapshot. + * If so then the associated block is shared with the last snapshot device. + * Any block on a device created *after* the device last got snapshotted is + * necessarily not shared. + */ static bool __snapshotted_since(struct dm_thin_device *td, uint32_t time) { return td->snapshotted_time > time; @@ -1458,6 +1464,20 @@ int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block) return r; } +int dm_pool_block_is_used(struct dm_pool_metadata *pmd, dm_block_t b, bool *result) +{ + int r; + uint32_t ref_count; + + down_read(&pmd->root_lock); + r = dm_sm_get_count(pmd->data_sm, b, &ref_count); + if (!r) + *result = (ref_count != 0); + up_read(&pmd->root_lock); + + return r; +} + bool dm_thin_changed_this_transaction(struct dm_thin_device *td) { int r; diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h index 7bcc0e1d6238..2edf5dbac76a 100644 --- a/drivers/md/dm-thin-metadata.h +++ b/drivers/md/dm-thin-metadata.h @@ -181,6 +181,8 @@ int dm_pool_get_data_block_size(struct dm_pool_metadata *pmd, sector_t *result); int dm_pool_get_data_dev_size(struct dm_pool_metadata *pmd, dm_block_t *result); +int dm_pool_block_is_used(struct dm_pool_metadata *pmd, dm_block_t b, bool *result); + /* * Returns -ENOSPC if the new size is too small and already allocated * blocks would be lost. 
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index da65febdb6c4..51e656a3002c 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -512,6 +512,7 @@ struct dm_thin_new_mapping { unsigned quiesced:1; unsigned prepared:1; unsigned pass_discard:1; + unsigned definitely_not_shared:1; struct thin_c *tc; dm_block_t virt_block; @@ -683,7 +684,15 @@ static void process_prepared_discard_passdown(struct dm_thin_new_mapping *m) cell_defer_no_holder(tc, m->cell2); if (m->pass_discard) - remap_and_issue(tc, m->bio, m->data_block); + if (m->definitely_not_shared) + remap_and_issue(tc, m->bio, m->data_block); + else { + bool used = false; + if (dm_pool_block_is_used(tc->pool->pmd, m->data_block, &used) || used) + bio_endio(m->bio, 0); + else + remap_and_issue(tc, m->bio, m->data_block); + } else bio_endio(m->bio, 0); @@ -1036,7 +1045,8 @@ static void process_discard(struct thin_c *tc, struct bio *bio) */ m = get_next_mapping(pool); m->tc = tc; - m->pass_discard = (!lookup_result.shared) && pool->pf.discard_passdown; + m->pass_discard = pool->pf.discard_passdown; + m->definitely_not_shared = !lookup_result.shared; m->virt_block = block; m->data_block = lookup_result.block; m->cell = cell; From 57a2f238564e0700c8648238d31f366246a5b963 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 22 Nov 2013 19:51:39 -0500 Subject: [PATCH 03/35] dm table: remove unused buggy code that extends the targets array A device mapper table is allocated in the following way: * The function dm_table_create is called, it gets the number of targets as an argument -- it allocates a targets array accordingly. * For each target, we call dm_table_add_target. If we add more targets than were specified in dm_table_create, the function dm_table_add_target reallocates the targets array. However, this reallocation code is wrong - it moves the targets array to a new location, while some target constructors hold pointers to the array in the old location. The following DM target drivers save the pointer to the target structure, so they corrupt memory if the target array is moved: multipath, raid, mirror, snapshot, stripe, switch, thin, verity. Under normal circumstances, the reallocation function is not called (because dm_table_create is called with the correct number of targets), so the buggy reallocation code is not used. Prior to the fix "dm table: fail dm_table_create on dm_round_up overflow", the reallocation code could only be used in case the user specifies too large a value in param->target_count, such as 0xffffffff. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-table.c | 22 ++-------------------- 1 file changed, 2 insertions(+), 20 deletions(-) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 3ba6a3859ce3..6a7f2b83a126 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -155,7 +155,6 @@ static int alloc_targets(struct dm_table *t, unsigned int num) { sector_t *n_highs; struct dm_target *n_targets; - int n = t->num_targets; /* * Allocate both the target array and offset array at once. 
@@ -169,12 +168,7 @@ static int alloc_targets(struct dm_table *t, unsigned int num) n_targets = (struct dm_target *) (n_highs + num); - if (n) { - memcpy(n_highs, t->highs, sizeof(*n_highs) * n); - memcpy(n_targets, t->targets, sizeof(*n_targets) * n); - } - - memset(n_highs + n, -1, sizeof(*n_highs) * (num - n)); + memset(n_highs, -1, sizeof(*n_highs) * num); vfree(t->highs); t->num_allocated = num; @@ -260,17 +254,6 @@ void dm_table_destroy(struct dm_table *t) kfree(t); } -/* - * Checks to see if we need to extend highs or targets. - */ -static inline int check_space(struct dm_table *t) -{ - if (t->num_targets >= t->num_allocated) - return alloc_targets(t, t->num_allocated * 2); - - return 0; -} - /* * See if we've already got a device in the list. */ @@ -731,8 +714,7 @@ int dm_table_add_target(struct dm_table *t, const char *type, return -EINVAL; } - if ((r = check_space(t))) - return r; + BUG_ON(t->num_targets >= t->num_allocated); tgt = t->targets + t->num_targets; memset(tgt, 0, sizeof(*tgt)); From 42065460aed7201ec8adf0179a258a23bd1ebd78 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 15 Nov 2013 16:12:51 -0500 Subject: [PATCH 04/35] dm delay: use per-bio data instead of a mempool and slab cache Starting with commit c0820cf5ad095 ("dm: introduce per_bio_data"), device mapper has the capability to pre-allocate a target-specific structure with the bio. This patch changes dm-delay to use this facility instead of a slab cache and mempool. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-delay.c | 35 +++++++---------------------------- 1 file changed, 7 insertions(+), 28 deletions(-) diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index 2f91d6d4a2cc..a8a511c053a5 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -24,7 +24,6 @@ struct delay_c { struct work_struct flush_expired_bios; struct list_head delayed_bios; atomic_t may_delay; - mempool_t *delayed_pool; struct dm_dev *dev_read; sector_t start_read; @@ -40,14 +39,11 @@ struct delay_c { struct dm_delay_info { struct delay_c *context; struct list_head list; - struct bio *bio; unsigned long expires; }; static DEFINE_MUTEX(delayed_bios_lock); -static struct kmem_cache *delayed_cache; - static void handle_delayed_timer(unsigned long data) { struct delay_c *dc = (struct delay_c *)data; @@ -87,13 +83,14 @@ static struct bio *flush_delayed_bios(struct delay_c *dc, int flush_all) mutex_lock(&delayed_bios_lock); list_for_each_entry_safe(delayed, next, &dc->delayed_bios, list) { if (flush_all || time_after_eq(jiffies, delayed->expires)) { + struct bio *bio = dm_bio_from_per_bio_data(delayed, + sizeof(struct dm_delay_info)); list_del(&delayed->list); - bio_list_add(&flush_bios, delayed->bio); - if ((bio_data_dir(delayed->bio) == WRITE)) + bio_list_add(&flush_bios, bio); + if ((bio_data_dir(bio) == WRITE)) delayed->context->writes--; else delayed->context->reads--; - mempool_free(delayed, dc->delayed_pool); continue; } @@ -185,12 +182,6 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv) } out: - dc->delayed_pool = mempool_create_slab_pool(128, delayed_cache); - if (!dc->delayed_pool) { - DMERR("Couldn't create delayed bio pool."); - goto bad_dev_write; - } - dc->kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0); if (!dc->kdelayd_wq) { DMERR("Couldn't start kdelayd"); @@ -206,12 +197,11 @@ out: ti->num_flush_bios = 1; ti->num_discard_bios = 1; + ti->per_bio_data_size = sizeof(struct dm_delay_info); ti->private = dc; return 0; bad_queue: - 
 mempool_destroy(dc->delayed_pool); -bad_dev_write: if (dc->dev_write) dm_put_device(ti, dc->dev_write); bad_dev_read: @@ -232,7 +222,6 @@ static void delay_dtr(struct dm_target *ti) if (dc->dev_write) dm_put_device(ti, dc->dev_write); - mempool_destroy(dc->delayed_pool); kfree(dc); } @@ -244,10 +233,9 @@ static int delay_bio(struct delay_c *dc, int delay, struct bio *bio) if (!delay || !atomic_read(&dc->may_delay)) return 1; - delayed = mempool_alloc(dc->delayed_pool, GFP_NOIO); + delayed = dm_per_bio_data(bio, sizeof(struct dm_delay_info)); delayed->context = dc; - delayed->bio = bio; delayed->expires = expires = jiffies + (delay * HZ / 1000); mutex_lock(&delayed_bios_lock); @@ -356,13 +344,7 @@ static struct target_type delay_target = { static int __init dm_delay_init(void) { - int r = -ENOMEM; - - delayed_cache = KMEM_CACHE(dm_delay_info, 0); - if (!delayed_cache) { - DMERR("Couldn't create delayed bio cache."); - goto bad_memcache; - } + int r; r = dm_register_target(&delay_target); if (r < 0) { @@ -373,15 +355,12 @@ static int __init dm_delay_init(void) return 0; bad_register: - kmem_cache_destroy(delayed_cache); -bad_memcache: return r; } static void __exit dm_delay_exit(void) { dm_unregister_target(&delay_target); - kmem_cache_destroy(delayed_cache); } /* Module hooks */ From c46985e211fa6d6895104cc4858e85e53e8c7731 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 13 Dec 2013 09:58:46 -0500 Subject: [PATCH 05/35] dm space map metadata: limit errors in sm_metadata_new_block The "unable to allocate new metadata block" error can be particularly verbose if there is a systemic issue with the metadata device. Signed-off-by: Mike Snitzer Acked-by: Joe Thornber --- drivers/md/persistent-data/dm-space-map-metadata.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c index 58fc1eef7499..e93084419068 100644 --- a/drivers/md/persistent-data/dm-space-map-metadata.c +++ b/drivers/md/persistent-data/dm-space-map-metadata.c @@ -385,13 +385,13 @@ static int sm_metadata_new_block(struct dm_space_map *sm, dm_block_t *b) int r = sm_metadata_new_block_(sm, b); if (r) { - DMERR("unable to allocate new metadata block"); + DMERR_LIMIT("unable to allocate new metadata block"); return r; } r = sm_metadata_get_nr_free(sm, &count); if (r) { - DMERR("couldn't get free block count"); + DMERR_LIMIT("couldn't get free block count"); return r; } From 10343180f5c4023043e82d46e71048e68f975f50 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 13 Dec 2013 08:24:44 -0500 Subject: [PATCH 06/35] dm persistent data: cleanup dm-thin specific references in text DM's persistent-data library is now used by multiple targets, so exclusive references to "pool" or "thin provisioning" need to be cleaned up. Adjust Kconfig's DM_DEBUG_BLOCK_STACK_TRACING text and remove "pool" from a block manager error message. Signed-off-by: Mike Snitzer Acked-by: Joe Thornber --- drivers/md/Kconfig | 6 +++--- drivers/md/persistent-data/dm-block-manager.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index f2ccbc3b9fe4..7441344bd214 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -250,12 +250,12 @@ config DM_THIN_PROVISIONING Provides thin provisioning and snapshots that share a data store.
config DM_DEBUG_BLOCK_STACK_TRACING - boolean "Keep stack trace of thin provisioning block lock holders" - depends on STACKTRACE_SUPPORT && DM_THIN_PROVISIONING + boolean "Keep stack trace of persistent data block lock holders" + depends on STACKTRACE_SUPPORT && DM_PERSISTENT_DATA select STACKTRACE ---help--- Enable this for messages that may help debug problems with the - block manager locking used by thin provisioning. + block manager locking used by thin provisioning and caching. If unsure, say N. diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c index 064a3c271baa..455f79279a16 100644 --- a/drivers/md/persistent-data/dm-block-manager.c +++ b/drivers/md/persistent-data/dm-block-manager.c @@ -104,7 +104,7 @@ static int __check_holder(struct block_lock *lock) for (i = 0; i < MAX_HOLDERS; i++) { if (lock->holders[i] == current) { - DMERR("recursive lock detected in pool metadata"); + DMERR("recursive lock detected in metadata"); #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING DMERR("previously held here:"); print_stack_trace(lock->traces + i, 4); From 7f214665124401db3d171fd1f9f1ec6552b38b36 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Tue, 17 Dec 2013 13:43:31 -0500 Subject: [PATCH 07/35] dm thin: use bool rather than unsigned for flags in structures Also, move 'err' member in dm_thin_new_mapping structure to eliminate 4 byte hole (reduces size from 88 bytes to 80). Signed-off-by: Mike Snitzer Acked-by: Joe Thornber --- drivers/md/dm-thin-metadata.h | 2 +- drivers/md/dm-thin.c | 22 +++++++++++----------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h index 2edf5dbac76a..9a368567632f 100644 --- a/drivers/md/dm-thin-metadata.h +++ b/drivers/md/dm-thin-metadata.h @@ -131,7 +131,7 @@ dm_thin_id dm_thin_dev_id(struct dm_thin_device *td); struct dm_thin_lookup_result { dm_block_t block; - unsigned shared:1; + bool shared:1; }; /* diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 51e656a3002c..5f1b11e45702 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -509,16 +509,16 @@ static void remap_and_issue(struct thin_c *tc, struct bio *bio, struct dm_thin_new_mapping { struct list_head list; - unsigned quiesced:1; - unsigned prepared:1; - unsigned pass_discard:1; - unsigned definitely_not_shared:1; + bool quiesced:1; + bool prepared:1; + bool pass_discard:1; + bool definitely_not_shared:1; + int err; struct thin_c *tc; dm_block_t virt_block; dm_block_t data_block; struct dm_bio_prison_cell *cell, *cell2; - int err; /* * If the bio covers the whole area of a block then we can avoid @@ -549,7 +549,7 @@ static void copy_complete(int read_err, unsigned long write_err, void *context) m->err = read_err || write_err ? -EIO : 0; spin_lock_irqsave(&pool->lock, flags); - m->prepared = 1; + m->prepared = true; __maybe_add_mapping(m); spin_unlock_irqrestore(&pool->lock, flags); } @@ -564,7 +564,7 @@ static void overwrite_endio(struct bio *bio, int err) m->err = err; spin_lock_irqsave(&pool->lock, flags); - m->prepared = 1; + m->prepared = true; __maybe_add_mapping(m); spin_unlock_irqrestore(&pool->lock, flags); } @@ -788,7 +788,7 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, m->cell = cell; if (!dm_deferred_set_add_work(pool->shared_read_ds, &m->list)) - m->quiesced = 1; + m->quiesced = true; /* * IO to pool_dev remaps to the pool target's data_dev. 
@@ -848,8 +848,8 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block, struct pool *pool = tc->pool; struct dm_thin_new_mapping *m = get_next_mapping(pool); - m->quiesced = 1; - m->prepared = 0; + m->quiesced = true; + m->prepared = false; m->tc = tc; m->virt_block = virt_block; m->data_block = data_block; @@ -2904,7 +2904,7 @@ static int thin_endio(struct dm_target *ti, struct bio *bio, int err) spin_lock_irqsave(&pool->lock, flags); list_for_each_entry_safe(m, tmp, &work, list) { list_del(&m->list); - m->quiesced = 1; + m->quiesced = true; __maybe_add_mapping(m); } spin_unlock_irqrestore(&pool->lock, flags); From 8d30abff758b5f6c71343b7da6bb5de129a76c08 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Wed, 4 Dec 2013 19:16:11 -0500 Subject: [PATCH 08/35] dm thin: return error from alloc_data_block if pool is not in write mode Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/dm-thin.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 5f1b11e45702..1988019df5c9 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -923,6 +923,9 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result) if (pool->no_free_space) return -ENOSPC; + if (get_pool_mode(pool) != PM_WRITE) + return -EINVAL; + r = dm_pool_get_free_block_count(pool->pmd, &free_blocks); if (r) return r; From daec338bbdaa96ba5b14c4777603e65ef74c769b Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Wed, 11 Dec 2013 14:01:20 -0500 Subject: [PATCH 09/35] dm thin: add mappings to end of prepared_* lists Mappings could be processed in descending logical block order, particularly if buffered IO is used. This could adversely affect the latency of IO processing. Fix this by adding mappings to the end of the 'prepared_mappings' and 'prepared_discards' lists. Signed-off-by: Mike Snitzer Acked-by: Joe Thornber --- drivers/md/dm-thin.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 1988019df5c9..efa3d42ac70a 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -535,7 +535,7 @@ static void __maybe_add_mapping(struct dm_thin_new_mapping *m) struct pool *pool = m->tc->pool; if (m->quiesced && m->prepared) { - list_add(&m->list, &pool->prepared_mappings); + list_add_tail(&m->list, &pool->prepared_mappings); wake_worker(pool); } } @@ -1058,7 +1058,7 @@ static void process_discard(struct thin_c *tc, struct bio *bio) if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list)) { spin_lock_irqsave(&pool->lock, flags); - list_add(&m->list, &pool->prepared_discards); + list_add_tail(&m->list, &pool->prepared_discards); spin_unlock_irqrestore(&pool->lock, flags); wake_worker(pool); } @@ -2919,7 +2919,7 @@ static int thin_endio(struct dm_target *ti, struct bio *bio, int err) if (!list_empty(&work)) { spin_lock_irqsave(&pool->lock, flags); list_for_each_entry_safe(m, tmp, &work, list) - list_add(&m->list, &pool->prepared_discards); + list_add_tail(&m->list, &pool->prepared_discards); spin_unlock_irqrestore(&pool->lock, flags); wake_worker(pool); } From 88a6621bed65ce2d421a808a2f60e1b64914d777 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Wed, 4 Dec 2013 20:16:12 -0500 Subject: [PATCH 10/35] dm thin: factor out check_low_water_mark and use bools Factor check_low_water_mark() out of alloc_data_block(). Change a couple unsigned flags in the pool structure to bool. 
Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/dm-thin.c | 37 ++++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index efa3d42ac70a..e49c27c91a1f 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -163,8 +163,8 @@ struct pool { int sectors_per_block_shift; struct pool_features pf; - unsigned low_water_triggered:1; /* A dm event has been sent */ - unsigned no_free_space:1; /* A -ENOSPC warning has been issued */ + bool low_water_triggered:1; /* A dm event has been sent */ + bool no_free_space:1; /* A -ENOSPC warning has been issued */ struct dm_bio_prison *prison; struct dm_kcopyd_client *copier; @@ -909,6 +909,20 @@ static int commit(struct pool *pool) return r; } +static void check_low_water_mark(struct pool *pool, dm_block_t free_blocks) +{ + unsigned long flags; + + if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) { + DMWARN("%s: reached low water mark for data device: sending event.", + dm_device_name(pool->pool_md)); + spin_lock_irqsave(&pool->lock, flags); + pool->low_water_triggered = true; + spin_unlock_irqrestore(&pool->lock, flags); + dm_table_event(pool->ti->table); + } +} + static int alloc_data_block(struct thin_c *tc, dm_block_t *result) { int r; @@ -930,14 +944,7 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result) if (r) return r; - if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) { - DMWARN("%s: reached low water mark for data device: sending event.", - dm_device_name(pool->pool_md)); - spin_lock_irqsave(&pool->lock, flags); - pool->low_water_triggered = 1; - spin_unlock_irqrestore(&pool->lock, flags); - dm_table_event(pool->ti->table); - } + check_low_water_mark(pool, free_blocks); if (!free_blocks) { /* @@ -963,7 +970,7 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result) DMWARN("%s: no free data space available.", dm_device_name(pool->pool_md)); spin_lock_irqsave(&pool->lock, flags); - pool->no_free_space = 1; + pool->no_free_space = true; spin_unlock_irqrestore(&pool->lock, flags); return -ENOSPC; } @@ -1780,8 +1787,8 @@ static struct pool *pool_create(struct mapped_device *pool_md, bio_list_init(&pool->deferred_flush_bios); INIT_LIST_HEAD(&pool->prepared_mappings); INIT_LIST_HEAD(&pool->prepared_discards); - pool->low_water_triggered = 0; - pool->no_free_space = 0; + pool->low_water_triggered = false; + pool->no_free_space = false; bio_list_init(&pool->retry_on_resume_list); pool->shared_read_ds = dm_deferred_set_create(); @@ -2298,8 +2305,8 @@ static void pool_resume(struct dm_target *ti) unsigned long flags; spin_lock_irqsave(&pool->lock, flags); - pool->low_water_triggered = 0; - pool->no_free_space = 0; + pool->low_water_triggered = false; + pool->no_free_space = false; __requeue_bios(pool); spin_unlock_irqrestore(&pool->lock, flags); From b53306558526a097a587774573b76d0d9903c5bf Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Wed, 4 Dec 2013 19:51:33 -0500 Subject: [PATCH 11/35] dm thin: handle metadata failures more consistently Introduce metadata_operation_failed() wrappers, around set_pool_mode(), to assist with improving the consistency of how metadata failures are handled. Logging is improved and metadata operation failures trigger read-only mode immediately. Also, eliminate redundant set_pool_mode() calls in the two alloc_data_block() caller's error paths. 
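For example, the metadata commit path reduces to the following shape (simplified from the dm-thin.c hunk below):

	r = dm_pool_commit_metadata(pool->pmd);
	if (r)
		metadata_operation_failed(pool, "dm_pool_commit_metadata", r);

where metadata_operation_failed() emits a rate-limited error naming the pool device and the failed operation, then immediately switches the pool to read-only mode via set_pool_mode(pool, PM_READ_ONLY).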
Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/dm-thin.c | 48 +++++++++++++++++++++++++------------------- 1 file changed, 27 insertions(+), 21 deletions(-) diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index e49c27c91a1f..35d2e41ef82f 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -198,7 +198,7 @@ struct pool { }; static enum pool_mode get_pool_mode(struct pool *pool); -static void set_pool_mode(struct pool *pool, enum pool_mode mode); +static void metadata_operation_failed(struct pool *pool, const char *op, int r); /* * Target context for a pool. @@ -641,9 +641,7 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m) */ r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block); if (r) { - DMERR_LIMIT("%s: dm_thin_insert_block() failed: error = %d", - dm_device_name(pool->pool_md), r); - set_pool_mode(pool, PM_READ_ONLY); + metadata_operation_failed(pool, "dm_thin_insert_block", r); cell_error(pool, m->cell); goto out; } @@ -900,11 +898,8 @@ static int commit(struct pool *pool) return -EINVAL; r = dm_pool_commit_metadata(pool->pmd); - if (r) { - DMERR_LIMIT("%s: dm_pool_commit_metadata failed: error = %d", - dm_device_name(pool->pool_md), r); - set_pool_mode(pool, PM_READ_ONLY); - } + if (r) + metadata_operation_failed(pool, "dm_pool_commit_metadata", r); return r; } @@ -941,8 +936,10 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result) return -EINVAL; r = dm_pool_get_free_block_count(pool->pmd, &free_blocks); - if (r) + if (r) { + metadata_operation_failed(pool, "dm_pool_get_free_block_count", r); return r; + } check_low_water_mark(pool, free_blocks); @@ -956,8 +953,10 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result) return r; r = dm_pool_get_free_block_count(pool->pmd, &free_blocks); - if (r) + if (r) { + metadata_operation_failed(pool, "dm_pool_get_free_block_count", r); return r; + } /* * If we still have no space we set a flag to avoid @@ -980,11 +979,11 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result) if (r) { if (r == -ENOSPC && !dm_pool_get_free_metadata_block_count(pool->pmd, &free_blocks) && - !free_blocks) { + !free_blocks) DMWARN("%s: no free metadata space available.", dm_device_name(pool->pool_md)); - set_pool_mode(pool, PM_READ_ONLY); - } + + metadata_operation_failed(pool, "dm_pool_alloc_data_block", r); return r; } @@ -1126,7 +1125,6 @@ static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block, default: DMERR_LIMIT("%s: alloc_data_block() failed: error = %d", __func__, r); - set_pool_mode(pool, PM_READ_ONLY); cell_error(pool, cell); break; } @@ -1205,7 +1203,6 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block default: DMERR_LIMIT("%s: alloc_data_block() failed: error = %d", __func__, r); - set_pool_mode(pool, PM_READ_ONLY); cell_error(pool, cell); break; } @@ -1449,6 +1446,18 @@ static void set_pool_mode(struct pool *pool, enum pool_mode mode) } } +/* + * Rather than calling set_pool_mode directly, use these which describe the + * reason for mode degradation. 
+ */ +static void metadata_operation_failed(struct pool *pool, const char *op, int r) +{ + DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d", + dm_device_name(pool->pool_md), op, r); + + set_pool_mode(pool, PM_READ_ONLY); +} + /*----------------------------------------------------------------*/ /* @@ -2209,9 +2218,7 @@ static int maybe_resize_data_dev(struct dm_target *ti, bool *need_commit) } else if (data_size > sb_data_size) { r = dm_pool_resize_data_dev(pool->pmd, data_size); if (r) { - DMERR("%s: failed to resize data device", - dm_device_name(pool->pool_md)); - set_pool_mode(pool, PM_READ_ONLY); + metadata_operation_failed(pool, "dm_pool_resize_data_dev", r); return r; } @@ -2248,8 +2255,7 @@ static int maybe_resize_metadata_dev(struct dm_target *ti, bool *need_commit) } else if (metadata_dev_size > sb_metadata_dev_size) { r = dm_pool_resize_metadata_dev(pool->pmd, metadata_dev_size); if (r) { - DMERR("%s: failed to resize metadata device", - dm_device_name(pool->pool_md)); + metadata_operation_failed(pool, "dm_pool_resize_metadata_dev", r); return r; } From 6f7f51d4344d530f725e9c932fa44f00ba363fa2 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Wed, 4 Dec 2013 10:25:53 -0500 Subject: [PATCH 12/35] dm thin: log info when growing the data or metadata device Signed-off-by: Mike Snitzer Acked-by: Joe Thornber --- drivers/md/dm-thin.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 35d2e41ef82f..234696009d7b 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -2216,6 +2216,10 @@ static int maybe_resize_data_dev(struct dm_target *ti, bool *need_commit) return -EINVAL; } else if (data_size > sb_data_size) { + if (sb_data_size) + DMINFO("%s: growing the data device from %llu to %llu blocks", + dm_device_name(pool->pool_md), + sb_data_size, (unsigned long long)data_size); r = dm_pool_resize_data_dev(pool->pmd, data_size); if (r) { metadata_operation_failed(pool, "dm_pool_resize_data_dev", r); @@ -2253,6 +2257,9 @@ static int maybe_resize_metadata_dev(struct dm_target *ti, bool *need_commit) return -EINVAL; } else if (metadata_dev_size > sb_metadata_dev_size) { + DMINFO("%s: growing the metadata device from %llu to %llu blocks", + dm_device_name(pool->pool_md), + sb_metadata_dev_size, metadata_dev_size); r = dm_pool_resize_metadata_dev(pool->pmd, metadata_dev_size); if (r) { metadata_operation_failed(pool, "dm_pool_resize_metadata_dev", r); From 399caddfb16f5fa30c66056a32477cf95c947e2b Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 5 Dec 2013 16:03:33 -0500 Subject: [PATCH 13/35] dm thin: cleanup and improve no space handling Factor out_of_data_space() out of alloc_data_block(). Eliminate the use of 'no_free_space' as a latch in alloc_data_block() -- this is no longer needed now that we switch to read-only mode when we run out of data or metadata space. In a later patch, the 'no_free_space' flag will be eliminated entirely (in favor of checking metadata rather than relying on a transient flag). Move the handling of no metadata space into metadata_operation_failed(). Set no_free_space when metadata space is exhausted too. This is useful because it offers consistency for the following patch, which will requeue data IOs if no_free_space is set. Also, rename no_space() to retry_bios_on_resume().
Signed-off-by: Mike Snitzer Acked-by: Joe Thornber --- drivers/md/dm-thin.c | 61 +++++++++++++++++++++++--------------------- 1 file changed, 32 insertions(+), 29 deletions(-) diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 234696009d7b..96ce36a1a764 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -198,6 +198,7 @@ struct pool { }; static enum pool_mode get_pool_mode(struct pool *pool); +static void out_of_data_space(struct pool *pool); static void metadata_operation_failed(struct pool *pool, const char *op, int r); /* @@ -922,16 +923,8 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result) { int r; dm_block_t free_blocks; - unsigned long flags; struct pool *pool = tc->pool; - /* - * Once no_free_space is set we must not allow allocation to succeed. - * Otherwise it is difficult to explain, debug, test and support. - */ - if (pool->no_free_space) - return -ENOSPC; - if (get_pool_mode(pool) != PM_WRITE) return -EINVAL; @@ -958,31 +951,14 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result) return r; } - /* - * If we still have no space we set a flag to avoid - * doing all this checking and return -ENOSPC. This - * flag serves as a latch that disallows allocations from - * this pool until the admin takes action (e.g. resize or - * table reload). - */ if (!free_blocks) { - DMWARN("%s: no free data space available.", - dm_device_name(pool->pool_md)); - spin_lock_irqsave(&pool->lock, flags); - pool->no_free_space = true; - spin_unlock_irqrestore(&pool->lock, flags); + out_of_data_space(pool); return -ENOSPC; } } r = dm_pool_alloc_data_block(pool->pmd, result); if (r) { - if (r == -ENOSPC && - !dm_pool_get_free_metadata_block_count(pool->pmd, &free_blocks) && - !free_blocks) - DMWARN("%s: no free metadata space available.", - dm_device_name(pool->pool_md)); - metadata_operation_failed(pool, "dm_pool_alloc_data_block", r); return r; } @@ -1006,7 +982,7 @@ static void retry_on_resume(struct bio *bio) spin_unlock_irqrestore(&pool->lock, flags); } -static void no_space(struct pool *pool, struct dm_bio_prison_cell *cell) +static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *cell) { struct bio *bio; struct bio_list bios; @@ -1119,7 +1095,7 @@ static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block, break; case -ENOSPC: - no_space(pool, cell); + retry_bios_on_resume(pool, cell); break; default: @@ -1197,7 +1173,7 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block break; case -ENOSPC: - no_space(pool, cell); + retry_bios_on_resume(pool, cell); break; default: @@ -1446,15 +1422,42 @@ static void set_pool_mode(struct pool *pool, enum pool_mode mode) } } +static void set_no_free_space(struct pool *pool) +{ + unsigned long flags; + + spin_lock_irqsave(&pool->lock, flags); + pool->no_free_space = true; + spin_unlock_irqrestore(&pool->lock, flags); +} + /* * Rather than calling set_pool_mode directly, use these which describe the * reason for mode degradation. 
*/ +static void out_of_data_space(struct pool *pool) +{ + DMERR_LIMIT("%s: no free data space available.", + dm_device_name(pool->pool_md)); + set_no_free_space(pool); + set_pool_mode(pool, PM_READ_ONLY); +} + static void metadata_operation_failed(struct pool *pool, const char *op, int r) { + dm_block_t free_blocks; + DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d", dm_device_name(pool->pool_md), op, r); + if (r == -ENOSPC && + !dm_pool_get_free_metadata_block_count(pool->pmd, &free_blocks) && + !free_blocks) { + DMERR_LIMIT("%s: no free metadata space available.", + dm_device_name(pool->pool_md)); + set_no_free_space(pool); + } + set_pool_mode(pool, PM_READ_ONLY); } From 8c0f0e8c9f07e6554b2281f86f00e769cf805fd9 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 5 Dec 2013 15:47:24 -0500 Subject: [PATCH 14/35] dm thin: requeue bios to DM core if no_free_space and in read-only mode Now that we switch the pool to read-only mode when the data device runs out of space it causes active writers to get IO errors once we resume after resizing the data device. If no_free_space is set, save bios to the 'retry_on_resume_list' and requeue them on resume (once the data or metadata device may have been resized). With this patch the resize_io test passes again (on slower storage): dmtest run --suite thin-provisioning -n /resize_io/ Later patches fix some subtle races associated with the pool mode transitions done as part of the pool's -ENOSPC handling. These races are exposed on fast storage (e.g. PCIe SSD). Signed-off-by: Mike Snitzer Acked-by: Joe Thornber --- drivers/md/dm-thin.c | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 96ce36a1a764..53252d2af249 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -164,7 +164,7 @@ struct pool { struct pool_features pf; bool low_water_triggered:1; /* A dm event has been sent */ - bool no_free_space:1; /* A -ENOSPC warning has been issued */ + bool no_free_space:1; /* bios will be requeued if set */ struct dm_bio_prison *prison; struct dm_kcopyd_client *copier; @@ -982,6 +982,20 @@ static void retry_on_resume(struct bio *bio) spin_unlock_irqrestore(&pool->lock, flags); } +static void handle_unserviceable_bio(struct pool *pool, struct bio *bio) +{ + /* + * When pool is read-only, no cell locking is needed because + * nothing is changing. 
+ */ + WARN_ON_ONCE(get_pool_mode(pool) != PM_READ_ONLY); + + if (pool->no_free_space) + retry_on_resume(bio); + else + bio_io_error(bio); +} + static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *cell) { struct bio *bio; @@ -991,7 +1005,7 @@ static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *c cell_release(pool, cell, &bios); while ((bio = bio_list_pop(&bios))) - retry_on_resume(bio); + handle_unserviceable_bio(pool, bio); } static void process_discard(struct thin_c *tc, struct bio *bio) @@ -1245,7 +1259,7 @@ static void process_bio_read_only(struct thin_c *tc, struct bio *bio) switch (r) { case 0: if (lookup_result.shared && (rw == WRITE) && bio->bi_size) - bio_io_error(bio); + handle_unserviceable_bio(tc->pool, bio); else { inc_all_io_entry(tc->pool, bio); remap_and_issue(tc, bio, lookup_result.block); @@ -1254,7 +1268,7 @@ static void process_bio_read_only(struct thin_c *tc, struct bio *bio) case -ENODATA: if (rw != READ) { - bio_io_error(bio); + handle_unserviceable_bio(tc->pool, bio); break; } @@ -1565,9 +1579,9 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio) if (get_pool_mode(tc->pool) == PM_READ_ONLY) { /* * This block isn't provisioned, and we have no way - * of doing so. Just error it. + * of doing so. */ - bio_io_error(bio); + handle_unserviceable_bio(tc->pool, bio); return DM_MAPIO_SUBMITTED; } /* fall through */ From 787a996cb251e20f560e1615cd85693562541a7a Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 6 Dec 2013 16:21:43 -0500 Subject: [PATCH 15/35] dm thin: add error_if_no_space feature If the pool runs out of data or metadata space, the pool can either queue or error the IO destined to the data device. The default is to queue the IO until more space is added. An admin may now configure the pool to error IO when no space is available by setting the 'error_if_no_space' feature when loading the thin-pool table. Signed-off-by: Mike Snitzer Acked-by: Joe Thornber --- .../device-mapper/thin-provisioning.txt | 7 +++++ drivers/md/dm-thin.c | 31 +++++++++++++++---- 2 files changed, 32 insertions(+), 6 deletions(-) diff --git a/Documentation/device-mapper/thin-provisioning.txt b/Documentation/device-mapper/thin-provisioning.txt index 50c44cf79b0e..8a7a3d46e0da 100644 --- a/Documentation/device-mapper/thin-provisioning.txt +++ b/Documentation/device-mapper/thin-provisioning.txt @@ -235,6 +235,8 @@ i) Constructor read_only: Don't allow any changes to be made to the pool metadata. + error_if_no_space: Error IOs, instead of queueing, if no space. + Data block size must be between 64KB (128 sectors) and 1GB (2097152 sectors) inclusive. @@ -276,6 +278,11 @@ ii) Status contain the string 'Fail'. The userspace recovery tools should then be used. + error_if_no_space|queue_if_no_space + If the pool runs out of data or metadata space, the pool will + either queue or error the IO destined to the data device. The + default is to queue the IO until more space is added. 
+ iii) Messages create_thin diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 53252d2af249..075c39edea21 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -144,6 +144,7 @@ struct pool_features { bool zero_new_blocks:1; bool discard_enabled:1; bool discard_passdown:1; + bool error_if_no_space:1; }; struct thin_c; @@ -1440,6 +1441,9 @@ static void set_no_free_space(struct pool *pool) { unsigned long flags; + if (pool->pf.error_if_no_space) + return; + spin_lock_irqsave(&pool->lock, flags); pool->no_free_space = true; spin_unlock_irqrestore(&pool->lock, flags); @@ -1723,6 +1727,7 @@ static void pool_features_init(struct pool_features *pf) pf->zero_new_blocks = true; pf->discard_enabled = true; pf->discard_passdown = true; + pf->error_if_no_space = false; } static void __pool_destroy(struct pool *pool) @@ -1968,6 +1973,9 @@ static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf, else if (!strcasecmp(arg_name, "read_only")) pf->mode = PM_READ_ONLY; + else if (!strcasecmp(arg_name, "error_if_no_space")) + pf->error_if_no_space = true; + else { ti->error = "Unrecognised pool feature requested"; r = -EINVAL; @@ -2038,6 +2046,8 @@ static dm_block_t calc_metadata_threshold(struct pool_c *pt) * skip_block_zeroing: skips the zeroing of newly-provisioned blocks. * ignore_discard: disable discard * no_discard_passdown: don't pass discards down to the data device + * read_only: Don't allow any changes to be made to the pool metadata. + * error_if_no_space: error IOs, instead of queueing, if no space. */ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) { @@ -2555,7 +2565,8 @@ static void emit_flags(struct pool_features *pf, char *result, unsigned sz, unsigned maxlen) { unsigned count = !pf->zero_new_blocks + !pf->discard_enabled + - !pf->discard_passdown + (pf->mode == PM_READ_ONLY); + !pf->discard_passdown + (pf->mode == PM_READ_ONLY) + + pf->error_if_no_space; DMEMIT("%u ", count); if (!pf->zero_new_blocks) @@ -2569,6 +2580,9 @@ static void emit_flags(struct pool_features *pf, char *result, if (pf->mode == PM_READ_ONLY) DMEMIT("read_only "); + + if (pf->error_if_no_space) + DMEMIT("error_if_no_space "); } /* @@ -2663,11 +2677,16 @@ static void pool_status(struct dm_target *ti, status_type_t type, DMEMIT("rw "); if (!pool->pf.discard_enabled) - DMEMIT("ignore_discard"); + DMEMIT("ignore_discard "); else if (pool->pf.discard_passdown) - DMEMIT("discard_passdown"); + DMEMIT("discard_passdown "); else - DMEMIT("no_discard_passdown"); + DMEMIT("no_discard_passdown "); + + if (pool->pf.error_if_no_space) + DMEMIT("error_if_no_space "); + else + DMEMIT("queue_if_no_space "); break; @@ -2766,7 +2785,7 @@ static struct target_type pool_target = { .name = "thin-pool", .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | DM_TARGET_IMMUTABLE, - .version = {1, 9, 0}, + .version = {1, 10, 0}, .module = THIS_MODULE, .ctr = pool_ctr, .dtr = pool_dtr, @@ -3053,7 +3072,7 @@ static int thin_iterate_devices(struct dm_target *ti, static struct target_type thin_target = { .name = "thin", - .version = {1, 9, 0}, + .version = {1, 10, 0}, .module = THIS_MODULE, .ctr = thin_ctr, .dtr = thin_dtr, From 6d16202be7bca169771e2cec140a6c6c53ce9df5 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 20 Dec 2013 18:09:02 -0500 Subject: [PATCH 16/35] dm thin: eliminate the no_free_space flag The pool's error_if_no_space flag can easily serve the same purpose that no_free_space did, namely: control whether handle_unserviceable_bio() will error a bio or 
requeue it. This is cleaner since error_if_no_space is established when the pool's features are processed during table load. So it avoids managing the no_free_space flag by taking the pool's spinlock. Signed-off-by: Mike Snitzer --- drivers/md/dm-thin.c | 26 ++++---------------------- 1 file changed, 4 insertions(+), 22 deletions(-) diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 075c39edea21..a55c5ebb4031 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -165,7 +165,6 @@ struct pool { struct pool_features pf; bool low_water_triggered:1; /* A dm event has been sent */ - bool no_free_space:1; /* bios will be requeued if set */ struct dm_bio_prison *prison; struct dm_kcopyd_client *copier; @@ -991,10 +990,10 @@ static void handle_unserviceable_bio(struct pool *pool, struct bio *bio) */ WARN_ON_ONCE(get_pool_mode(pool) != PM_READ_ONLY); - if (pool->no_free_space) - retry_on_resume(bio); - else + if (pool->pf.error_if_no_space) bio_io_error(bio); + else + retry_on_resume(bio); } static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *cell) @@ -1437,18 +1436,6 @@ static void set_pool_mode(struct pool *pool, enum pool_mode mode) } } -static void set_no_free_space(struct pool *pool) -{ - unsigned long flags; - - if (pool->pf.error_if_no_space) - return; - - spin_lock_irqsave(&pool->lock, flags); - pool->no_free_space = true; - spin_unlock_irqrestore(&pool->lock, flags); -} - /* * Rather than calling set_pool_mode directly, use these which describe the * reason for mode degradation. @@ -1457,7 +1444,6 @@ static void out_of_data_space(struct pool *pool) { DMERR_LIMIT("%s: no free data space available.", dm_device_name(pool->pool_md)); - set_no_free_space(pool); set_pool_mode(pool, PM_READ_ONLY); } @@ -1470,11 +1456,9 @@ static void metadata_operation_failed(struct pool *pool, const char *op, int r) if (r == -ENOSPC && !dm_pool_get_free_metadata_block_count(pool->pmd, &free_blocks) && - !free_blocks) { + !free_blocks) DMERR_LIMIT("%s: no free metadata space available.", dm_device_name(pool->pool_md)); - set_no_free_space(pool); - } set_pool_mode(pool, PM_READ_ONLY); } @@ -1819,7 +1803,6 @@ static struct pool *pool_create(struct mapped_device *pool_md, INIT_LIST_HEAD(&pool->prepared_mappings); INIT_LIST_HEAD(&pool->prepared_discards); pool->low_water_triggered = false; - pool->no_free_space = false; bio_list_init(&pool->retry_on_resume_list); pool->shared_read_ds = dm_deferred_set_create(); @@ -2346,7 +2329,6 @@ static void pool_resume(struct dm_target *ti) spin_lock_irqsave(&pool->lock, flags); pool->low_water_triggered = false; - pool->no_free_space = false; __requeue_bios(pool); spin_unlock_irqrestore(&pool->lock, flags); From 8b64e881eb40ac8b9bfcbce068a97eef819044ee Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 20 Dec 2013 14:27:28 -0500 Subject: [PATCH 17/35] dm thin: fix set_pool_mode exposed pool operation races The pool mode must not be switched until after the corresponding pool process_* methods have been established. Otherwise, because set_pool_mode() isn't interlocked with the IO path for performance reasons, the IO path can end up executing process_* operations that don't match the mode. This patch eliminates problems like the following (as seen on really fast PCIe SSD storage when transitioning the pool's mode from PM_READ_ONLY to PM_WRITE): kernel: device-mapper: thin: 253:2: reached low water mark for data device: sending event. kernel: device-mapper: thin: 253:2: no free data space available. 
kernel: device-mapper: thin: 253:2: switching pool to read-only mode kernel: device-mapper: thin: 253:2: switching pool to write mode kernel: ------------[ cut here ]------------ kernel: WARNING: CPU: 11 PID: 7564 at drivers/md/dm-thin.c:995 handle_unserviceable_bio+0x146/0x160 [dm_thin_pool]() ... kernel: Workqueue: dm-thin do_worker [dm_thin_pool] kernel: 00000000000003e3 ffff880308831cc8 ffffffff8152ebcb 00000000000003e3 kernel: 0000000000000000 ffff880308831d08 ffffffff8104c46c ffff88032502a800 kernel: ffff880036409000 ffff88030ec7ce00 0000000000000001 00000000ffffffc3 kernel: Call Trace: kernel: [] dump_stack+0x49/0x5e kernel: [] warn_slowpath_common+0x8c/0xc0 kernel: [] warn_slowpath_null+0x1a/0x20 kernel: [] handle_unserviceable_bio+0x146/0x160 [dm_thin_pool] kernel: [] process_bio_read_only+0x136/0x180 [dm_thin_pool] kernel: [] process_deferred_bios+0xc5/0x230 [dm_thin_pool] kernel: [] do_worker+0x51/0x60 [dm_thin_pool] kernel: [] process_one_work+0x183/0x490 kernel: [] worker_thread+0x120/0x3a0 kernel: [] ? manage_workers+0x160/0x160 kernel: [] kthread+0xce/0xf0 kernel: [] ? kthread_freezable_should_stop+0x70/0x70 kernel: [] ret_from_fork+0x7c/0xb0 kernel: [] ? kthread_freezable_should_stop+0x70/0x70 kernel: ---[ end trace 3f00528e08ffa55c ]--- kernel: device-mapper: thin: pool mode is PM_WRITE not PM_READ_ONLY like expected!? dm-thin.c:995 was the WARN_ON_ONCE(get_pool_mode(pool) != PM_READ_ONLY); at the top of handle_unserviceable_bio(). And as the additional debugging I had conveys: the pool mode was _not_ PM_READ_ONLY like expected, it was already PM_WRITE, yet pool->process_bio was still set to process_bio_read_only(). Also, while fixing this up, reduce logging of redundant pool mode transitions by checking new_mode is different from old_mode. 
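Condensed, the fix simply defers the mode assignment until the process_* methods have been installed (a sketch; the full hunk is below):

	static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
	{
		enum pool_mode old_mode = pool->pf.mode;

		switch (new_mode) {
		...
		case PM_WRITE:
			if (old_mode != new_mode)
				DMINFO("%s: switching pool to write mode",
				       dm_device_name(pool->pool_md));
			pool->process_bio = process_bio;
			pool->process_discard = process_discard;
			...
			break;
		}

		/*
		 * Only advertise the new mode once the process_ methods already
		 * match it, so the IO path never observes a mismatch.
		 */
		pool->pf.mode = new_mode;
	}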
Signed-off-by: Mike Snitzer Cc: stable@vger.kernel.org --- drivers/md/dm-thin.c | 40 +++++++++++++++++++++++++++------------- 1 file changed, 27 insertions(+), 13 deletions(-) diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index a55c5ebb4031..d2328bb05192 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -1392,16 +1392,16 @@ static enum pool_mode get_pool_mode(struct pool *pool) return pool->pf.mode; } -static void set_pool_mode(struct pool *pool, enum pool_mode mode) +static void set_pool_mode(struct pool *pool, enum pool_mode new_mode) { int r; + enum pool_mode old_mode = pool->pf.mode; - pool->pf.mode = mode; - - switch (mode) { + switch (new_mode) { case PM_FAIL: - DMERR("%s: switching pool to failure mode", - dm_device_name(pool->pool_md)); + if (old_mode != new_mode) + DMERR("%s: switching pool to failure mode", + dm_device_name(pool->pool_md)); dm_pool_metadata_read_only(pool->pmd); pool->process_bio = process_bio_fail; pool->process_discard = process_bio_fail; @@ -1410,13 +1410,15 @@ static void set_pool_mode(struct pool *pool, enum pool_mode mode) break; case PM_READ_ONLY: - DMERR("%s: switching pool to read-only mode", - dm_device_name(pool->pool_md)); + if (old_mode != new_mode) + DMERR("%s: switching pool to read-only mode", + dm_device_name(pool->pool_md)); r = dm_pool_abort_metadata(pool->pmd); if (r) { DMERR("%s: aborting transaction failed", dm_device_name(pool->pool_md)); - set_pool_mode(pool, PM_FAIL); + new_mode = PM_FAIL; + set_pool_mode(pool, new_mode); } else { dm_pool_metadata_read_only(pool->pmd); pool->process_bio = process_bio_read_only; @@ -1427,6 +1429,9 @@ static void set_pool_mode(struct pool *pool, enum pool_mode mode) break; case PM_WRITE: + if (old_mode != new_mode) + DMINFO("%s: switching pool to write mode", + dm_device_name(pool->pool_md)); dm_pool_metadata_read_write(pool->pmd); pool->process_bio = process_bio; pool->process_discard = process_discard; @@ -1434,6 +1439,8 @@ static void set_pool_mode(struct pool *pool, enum pool_mode mode) pool->process_prepared_discard = process_prepared_discard; break; } + + pool->pf.mode = new_mode; } /* @@ -1676,6 +1683,17 @@ static int bind_control_target(struct pool *pool, struct dm_target *ti) enum pool_mode old_mode = pool->pf.mode; enum pool_mode new_mode = pt->adjusted_pf.mode; + /* + * Don't change the pool's mode until set_pool_mode() below. + * Otherwise the pool's process_* function pointers may + * not match the desired pool mode. + */ + pt->adjusted_pf.mode = old_mode; + + pool->ti = ti; + pool->pf = pt->adjusted_pf; + pool->low_water_blocks = pt->low_water_blocks; + /* * If we were in PM_FAIL mode, rollback of metadata failed. We're * not going to recover without a thin_repair. 
So we never let the @@ -1686,10 +1704,6 @@ static int bind_control_target(struct pool *pool, struct dm_target *ti) if (old_mode == PM_FAIL) new_mode = old_mode; - pool->ti = ti; - pool->low_water_blocks = pt->low_water_blocks; - pool->pf = pt->adjusted_pf; - set_pool_mode(pool, new_mode); return 0; From b815805154cc62debbc423a6c27ae39290b300ae Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Mon, 18 Nov 2013 13:32:43 -0500 Subject: [PATCH 18/35] dm cache policy mq: use list_del_init instead of list_del + INIT_LIST_HEAD Signed-off-by: Wei Yongjun Acked-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/dm-cache-policy-mq.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c index 64780ad73bb0..7f1aaa38a7e0 100644 --- a/drivers/md/dm-cache-policy-mq.c +++ b/drivers/md/dm-cache-policy-mq.c @@ -287,9 +287,8 @@ static struct entry *alloc_entry(struct entry_pool *ep) static struct entry *alloc_particular_entry(struct entry_pool *ep, dm_cblock_t cblock) { struct entry *e = ep->entries + from_cblock(cblock); - list_del(&e->list); - INIT_LIST_HEAD(&e->list); + list_del_init(&e->list); INIT_HLIST_NODE(&e->hlist); ep->nr_allocated++; From 78e03d69733c48312ae81fe4ac0790dbea412b9d Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Mon, 9 Dec 2013 12:53:05 +0000 Subject: [PATCH 19/35] dm cache policy mq: introduce three promotion threshold tunables Internally the mq policy maintains a promotion threshold variable. If the hit count of a block not in the cache goes above this threshold it gets promoted to the cache. This patch introduces three new tunables that allow you to tweak the promotion threshold by adding a small value. These adjustments depend on the io type: read_promote_adjustment: READ io, default 4 write_promote_adjustment: WRITE io, default 8 discard_promote_adjustment: READ/WRITE io to a discarded block, default 1 If you're trying to quickly warm a new cache device you may wish to reduce these to encourage promotion. Remember to switch them back to their defaults after the cache fills though. Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- .../device-mapper/cache-policies.txt | 16 ++++- drivers/md/dm-cache-policy-mq.c | 64 +++++++++++++------ 2 files changed, 57 insertions(+), 23 deletions(-) diff --git a/Documentation/device-mapper/cache-policies.txt b/Documentation/device-mapper/cache-policies.txt index df52a849957f..66c2774c0c64 100644 --- a/Documentation/device-mapper/cache-policies.txt +++ b/Documentation/device-mapper/cache-policies.txt @@ -40,8 +40,11 @@ on hit count on entry. The policy aims to take different cache miss costs into account and to adjust to varying load patterns automatically. Message and constructor argument pairs are: - 'sequential_threshold <#nr_sequential_ios>' and - 'random_threshold <#nr_random_ios>'. + 'sequential_threshold <#nr_sequential_ios>' + 'random_threshold <#nr_random_ios>' + 'read_promote_adjustment ' + 'write_promote_adjustment ' + 'discard_promote_adjustment ' The sequential threshold indicates the number of contiguous I/Os required before a stream is treated as sequential. The random threshold @@ -55,6 +58,15 @@ since spindles tend to have good bandwidth. The io_tracker counts contiguous I/Os to try to spot when the io is in one of these sequential modes. +Internally the mq policy maintains a promotion threshold variable. If +the hit count of a block not in the cache goes above this threshold it +gets promoted to the cache. 
The read, write and discard promote adjustment +tunables allow you to tweak the promotion threshold by adding a small +value based on the io type. They default to 4, 8 and 1 respectively. +If you're trying to quickly warm a new cache device you may wish to +reduce these to encourage promotion. Remember to switch them back to +their defaults after the cache fills though. + cleaner ------- diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c index 7f1aaa38a7e0..e63e36cefc89 100644 --- a/drivers/md/dm-cache-policy-mq.c +++ b/drivers/md/dm-cache-policy-mq.c @@ -390,6 +390,10 @@ struct mq_policy { */ unsigned promote_threshold; + unsigned discard_promote_adjustment; + unsigned read_promote_adjustment; + unsigned write_promote_adjustment; + /* * The hash table allows us to quickly find an entry by origin * block. Both pre_cache and cache entries are in here. @@ -399,6 +403,10 @@ struct mq_policy { struct hlist_head *table; }; +#define DEFAULT_DISCARD_PROMOTE_ADJUSTMENT 1 +#define DEFAULT_READ_PROMOTE_ADJUSTMENT 4 +#define DEFAULT_WRITE_PROMOTE_ADJUSTMENT 8 + /*----------------------------------------------------------------*/ /* @@ -641,25 +649,21 @@ static int demote_cblock(struct mq_policy *mq, dm_oblock_t *oblock) * We bias towards reads, since they can be demoted at no cost if they * haven't been dirtied. */ -#define DISCARDED_PROMOTE_THRESHOLD 1 -#define READ_PROMOTE_THRESHOLD 4 -#define WRITE_PROMOTE_THRESHOLD 8 - static unsigned adjusted_promote_threshold(struct mq_policy *mq, bool discarded_oblock, int data_dir) { if (data_dir == READ) - return mq->promote_threshold + READ_PROMOTE_THRESHOLD; + return mq->promote_threshold + mq->read_promote_adjustment; if (discarded_oblock && (any_free_cblocks(mq) || any_clean_cblocks(mq))) { /* * We don't need to do any copying at all, so give this a * very low threshold. 
*/ - return DISCARDED_PROMOTE_THRESHOLD; + return mq->discard_promote_adjustment; } - return mq->promote_threshold + WRITE_PROMOTE_THRESHOLD; + return mq->promote_threshold + mq->write_promote_adjustment; } static bool should_promote(struct mq_policy *mq, struct entry *e, @@ -808,7 +812,7 @@ static int no_entry_found(struct mq_policy *mq, dm_oblock_t oblock, bool can_migrate, bool discarded_oblock, int data_dir, struct policy_result *result) { - if (adjusted_promote_threshold(mq, discarded_oblock, data_dir) == 1) { + if (adjusted_promote_threshold(mq, discarded_oblock, data_dir) <= 1) { if (can_migrate) insert_in_cache(mq, oblock, result); else @@ -1134,20 +1138,28 @@ static int mq_set_config_value(struct dm_cache_policy *p, const char *key, const char *value) { struct mq_policy *mq = to_mq_policy(p); - enum io_pattern pattern; unsigned long tmp; - if (!strcasecmp(key, "random_threshold")) - pattern = PATTERN_RANDOM; - else if (!strcasecmp(key, "sequential_threshold")) - pattern = PATTERN_SEQUENTIAL; - else - return -EINVAL; - if (kstrtoul(value, 10, &tmp)) return -EINVAL; - mq->tracker.thresholds[pattern] = tmp; + if (!strcasecmp(key, "random_threshold")) { + mq->tracker.thresholds[PATTERN_RANDOM] = tmp; + + } else if (!strcasecmp(key, "sequential_threshold")) { + mq->tracker.thresholds[PATTERN_SEQUENTIAL] = tmp; + + } else if (!strcasecmp(key, "discard_promote_adjustment")) + mq->discard_promote_adjustment = tmp; + + else if (!strcasecmp(key, "read_promote_adjustment")) + mq->read_promote_adjustment = tmp; + + else if (!strcasecmp(key, "write_promote_adjustment")) + mq->write_promote_adjustment = tmp; + + else + return -EINVAL; return 0; } @@ -1157,9 +1169,16 @@ static int mq_emit_config_values(struct dm_cache_policy *p, char *result, unsign ssize_t sz = 0; struct mq_policy *mq = to_mq_policy(p); - DMEMIT("4 random_threshold %u sequential_threshold %u", + DMEMIT("10 random_threshold %u " + "sequential_threshold %u " + "discard_promote_adjustment %u " + "read_promote_adjustment %u " + "write_promote_adjustment %u", mq->tracker.thresholds[PATTERN_RANDOM], - mq->tracker.thresholds[PATTERN_SEQUENTIAL]); + mq->tracker.thresholds[PATTERN_SEQUENTIAL], + mq->discard_promote_adjustment, + mq->read_promote_adjustment, + mq->write_promote_adjustment); return 0; } @@ -1212,6 +1231,9 @@ static struct dm_cache_policy *mq_create(dm_cblock_t cache_size, mq->hit_count = 0; mq->generation = 0; mq->promote_threshold = 0; + mq->discard_promote_adjustment = DEFAULT_DISCARD_PROMOTE_ADJUSTMENT; + mq->read_promote_adjustment = DEFAULT_READ_PROMOTE_ADJUSTMENT; + mq->write_promote_adjustment = DEFAULT_WRITE_PROMOTE_ADJUSTMENT; mutex_init(&mq->lock); spin_lock_init(&mq->tick_lock); @@ -1243,7 +1265,7 @@ bad_pre_cache_init: static struct dm_cache_policy_type mq_policy_type = { .name = "mq", - .version = {1, 1, 0}, + .version = {1, 2, 0}, .hint_size = 4, .owner = THIS_MODULE, .create = mq_create @@ -1251,7 +1273,7 @@ static struct dm_cache_policy_type mq_policy_type = { static struct dm_cache_policy_type default_policy_type = { .name = "default", - .version = {1, 1, 0}, + .version = {1, 2, 0}, .hint_size = 4, .owner = THIS_MODULE, .create = mq_create From c1a6416021b311fdb5b98e40ed8b29508845ed16 Mon Sep 17 00:00:00 2001 From: Chuansheng Liu Date: Tue, 7 Jan 2014 16:56:18 +0800 Subject: [PATCH 20/35] dm snapshot: call destroy_work_on_stack() to pair with INIT_WORK_ONSTACK() In case CONFIG_DEBUG_OBJECTS_WORK is defined, it is needed to call destroy_work_on_stack() which frees the debug object to pair with 
INIT_WORK_ONSTACK(). Signed-off-by: Liu, Chuansheng Signed-off-by: Mike Snitzer --- drivers/md/dm-snap-persistent.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c index 2d2b1b7588d7..2f5a9f83fc8d 100644 --- a/drivers/md/dm-snap-persistent.c +++ b/drivers/md/dm-snap-persistent.c @@ -257,6 +257,7 @@ static int chunk_io(struct pstore *ps, void *area, chunk_t chunk, int rw, INIT_WORK_ONSTACK(&req.work, do_metadata); queue_work(ps->metadata_wq, &req.work); flush_workqueue(ps->metadata_wq); + destroy_work_on_stack(&req.work); return req.result; } From 1ddd641ddcfa46d719189468b6856e9b17381a61 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 6 Jan 2014 22:53:28 -0500 Subject: [PATCH 21/35] dm: remove pointless kobject comparison in dm_get_from_kobject The comparison is always true and the compiler optimizes it out anyway. Milan offered additional context relative to the original commit 784aae735d ("dm: add name and uuid to sysfs") which introduced the code: "I think it is just relict of some experiments before I committed this simple embedded sysfs kobj handling". Signed-off-by: Mikulas Patocka Acked-by: Milan Broz Signed-off-by: Mike Snitzer --- drivers/md/dm.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 0704c523a76b..b3d937211a48 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -2905,17 +2905,11 @@ struct kobject *dm_kobject(struct mapped_device *md) return &md->kobj; } -/* - * struct mapped_device should not be exported outside of dm.c - * so use this check to verify that kobj is part of md structure - */ struct mapped_device *dm_get_from_kobject(struct kobject *kobj) { struct mapped_device *md; md = container_of(kobj, struct mapped_device, kobj); - if (&md->kobj != kobj) - return NULL; if (test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) From be35f486108227e10fe5d96fd42fb2b344c59983 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 6 Jan 2014 23:01:22 -0500 Subject: [PATCH 22/35] dm: wait until embedded kobject is released before destroying a device There may be other parts of the kernel holding a reference on the dm kobject. We must wait until all references are dropped before deallocating the mapped_device structure. The dm_kobject_release method signals that all references are dropped via completion. But dm_kobject_release doesn't free the kobject (which is embedded in the mapped_device structure). 
This is the sequence of operations: * when destroying a DM device, call kobject_put from dm_sysfs_exit * wait until all users stop using the kobject, when it happens the release method is called * the release method signals the completion and should return without delay * the dm device removal code that waits on the completion continues * the dm device removal code drops the dm_mod reference the device had * the dm device removal code frees the mapped_device structure that contains the kobject Using kobject this way should avoid the module unload race that was mentioned at the beginning of this thread: https://lkml.org/lkml/2014/1/4/83 Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer Cc: stable@vger.kernel.org --- drivers/md/dm-sysfs.c | 10 +++++++++- drivers/md/dm.c | 11 +++++++++++ drivers/md/dm.h | 2 ++ 3 files changed, 22 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-sysfs.c b/drivers/md/dm-sysfs.c index 84d2b91e4efb..e0cc5d6a9e46 100644 --- a/drivers/md/dm-sysfs.c +++ b/drivers/md/dm-sysfs.c @@ -79,6 +79,11 @@ static const struct sysfs_ops dm_sysfs_ops = { .show = dm_attr_show, }; +static void dm_kobject_release(struct kobject *kobj) +{ + complete(dm_get_completion_from_kobject(kobj)); +} + /* * dm kobject is embedded in mapped_device structure * no need to define release function here @@ -86,6 +91,7 @@ static const struct sysfs_ops dm_sysfs_ops = { static struct kobj_type dm_ktype = { .sysfs_ops = &dm_sysfs_ops, .default_attrs = dm_attrs, + .release = dm_kobject_release, }; /* @@ -104,5 +110,7 @@ int dm_sysfs_init(struct mapped_device *md) */ void dm_sysfs_exit(struct mapped_device *md) { - kobject_put(dm_kobject(md)); + struct kobject *kobj = dm_kobject(md); + kobject_put(kobj); + wait_for_completion(dm_get_completion_from_kobject(kobj)); } diff --git a/drivers/md/dm.c b/drivers/md/dm.c index b3d937211a48..e290e72922a4 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -203,6 +203,9 @@ struct mapped_device { /* sysfs handle */ struct kobject kobj; + /* wait until the kobject is released */ + struct completion kobj_completion; + /* zero-length flush that will be cloned and submitted to targets */ struct bio flush_bio; @@ -2041,6 +2044,7 @@ static struct mapped_device *alloc_dev(int minor) init_waitqueue_head(&md->wait); INIT_WORK(&md->work, dm_wq_work); init_waitqueue_head(&md->eventq); + init_completion(&md->kobj_completion); md->disk->major = _major; md->disk->first_minor = minor; @@ -2919,6 +2923,13 @@ struct mapped_device *dm_get_from_kobject(struct kobject *kobj) return md; } +struct completion *dm_get_completion_from_kobject(struct kobject *kobj) +{ + struct mapped_device *md = container_of(kobj, struct mapped_device, kobj); + + return &md->kobj_completion; +} + int dm_suspended_md(struct mapped_device *md) { return test_bit(DMF_SUSPENDED, &md->flags); diff --git a/drivers/md/dm.h b/drivers/md/dm.h index c57ba550f69e..1ab2028559ca 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -15,6 +15,7 @@ #include #include #include +#include #include "dm-stats.h" @@ -152,6 +153,7 @@ int dm_sysfs_init(struct mapped_device *md); void dm_sysfs_exit(struct mapped_device *md); struct kobject *dm_kobject(struct mapped_device *md); struct mapped_device *dm_get_from_kobject(struct kobject *kobj); +struct completion *dm_get_completion_from_kobject(struct kobject *kobj); /* * Targets for linear and striped mappings From 12c91a5c2d2a8e8cc40a9552313e1e7b0a2d9ee3 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Tue, 7 Jan 2014 15:47:59 +0000 Subject: [PATCH 23/35] dm space 
map common: make sure new space is used during extend When extending a low level space map we should update nr_blocks at the start so the new space is used for the index entries. Otherwise extend can fail, e.g.: sm_metadata_extend call sequence that fails: -> sm_ll_extend -> dm_tm_new_block -> dm_sm_new_block -> sm_bootstrap_new_block => returns -ENOSPC because smm->begin == smm->ll.nr_blocks Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer Cc: stable@vger.kernel.org --- drivers/md/persistent-data/dm-space-map-common.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c index 466a60bbd716..aacbe70c2c2e 100644 --- a/drivers/md/persistent-data/dm-space-map-common.c +++ b/drivers/md/persistent-data/dm-space-map-common.c @@ -245,6 +245,10 @@ int sm_ll_extend(struct ll_disk *ll, dm_block_t extra_blocks) return -EINVAL; } + /* + * We need to set this before the dm_tm_new_block() call below. + */ + ll->nr_blocks = nr_blocks; for (i = old_blocks; i < blocks; i++) { struct dm_block *b; struct disk_index_entry idx; @@ -252,6 +256,7 @@ int sm_ll_extend(struct ll_disk *ll, dm_block_t extra_blocks) r = dm_tm_new_block(ll->tm, &dm_sm_bitmap_validator, &b); if (r < 0) return r; + idx.blocknr = cpu_to_le64(dm_block_location(b)); r = dm_tm_unlock(ll->tm, b); @@ -266,7 +271,6 @@ int sm_ll_extend(struct ll_disk *ll, dm_block_t extra_blocks) return r; } - ll->nr_blocks = nr_blocks; return 0; } From 7e664b3dec431eebf0c5df5ff704d6197634cf35 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Tue, 7 Jan 2014 15:49:02 +0000 Subject: [PATCH 24/35] dm space map metadata: fix extending the space map When extending a metadata space map we should do the first commit whilst still in bootstrap mode -- a mode where all blocks get allocated in the new area. That way the commit overhead is allocated from the newly added space. Otherwise we risk running out of space. With this fix, and the previous commit "dm space map common: make sure new space is used during extend", the following device mapper testsuite test passes: dmtest run --suite thin-provisioning -n /resize_metadata_no_io/ Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer Cc: stable@vger.kernel.org --- .../md/persistent-data/dm-space-map-metadata.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c index e93084419068..bfbfe03228c1 100644 --- a/drivers/md/persistent-data/dm-space-map-metadata.c +++ b/drivers/md/persistent-data/dm-space-map-metadata.c @@ -608,20 +608,28 @@ static int sm_metadata_extend(struct dm_space_map *sm, dm_block_t extra_blocks) * Flick into a mode where all blocks get allocated in the new area. */ smm->begin = old_len; - memcpy(&smm->sm, &bootstrap_ops, sizeof(smm->sm)); + memcpy(sm, &bootstrap_ops, sizeof(*sm)); /* * Extend. */ r = sm_ll_extend(&smm->ll, extra_blocks); + if (r) + goto out; + for (i = old_len; !r && i < smm->begin; i++) { + r = sm_ll_inc(&smm->ll, i, &ev); + if (r) + goto out; + } + + r = sm_metadata_commit(sm); + +out: /* * Switch back to normal behaviour. 
*/ - memcpy(&smm->sm, &ops, sizeof(smm->sm)); - for (i = old_len; !r && i < smm->begin; i++) - r = sm_ll_inc(&smm->ll, i, &ev); - + memcpy(sm, &ops, sizeof(*sm)); return r; } From f164e6900f2be2c29f5c11ca52af5bb824f40826 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Fri, 20 Dec 2013 15:41:11 +0000 Subject: [PATCH 25/35] dm btree: add dm_btree_find_lowest_key dm_btree_find_lowest_key is the reciprocal of dm_btree_find_highest_key. Factor out common code for dm_btree_find_{highest,lowest}_key. dm_btree_find_lowest_key is needed for an upcoming DM target, as such it is best to get this interface in place. Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/persistent-data/dm-btree.c | 33 +++++++++++++++++++++------ drivers/md/persistent-data/dm-btree.h | 8 +++++++ 2 files changed, 34 insertions(+), 7 deletions(-) diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c index 468e371ee9b2..416060c25709 100644 --- a/drivers/md/persistent-data/dm-btree.c +++ b/drivers/md/persistent-data/dm-btree.c @@ -770,8 +770,8 @@ EXPORT_SYMBOL_GPL(dm_btree_insert_notify); /*----------------------------------------------------------------*/ -static int find_highest_key(struct ro_spine *s, dm_block_t block, - uint64_t *result_key, dm_block_t *next_block) +static int find_key(struct ro_spine *s, dm_block_t block, bool find_highest, + uint64_t *result_key, dm_block_t *next_block) { int i, r; uint32_t flags; @@ -788,7 +788,11 @@ static int find_highest_key(struct ro_spine *s, dm_block_t block, else i--; - *result_key = le64_to_cpu(ro_node(s)->keys[i]); + if (find_highest) + *result_key = le64_to_cpu(ro_node(s)->keys[i]); + else + *result_key = le64_to_cpu(ro_node(s)->keys[0]); + if (next_block || flags & INTERNAL_NODE) block = value64(ro_node(s), i); @@ -799,16 +803,16 @@ static int find_highest_key(struct ro_spine *s, dm_block_t block, return 0; } -int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root, - uint64_t *result_keys) +static int dm_btree_find_key(struct dm_btree_info *info, dm_block_t root, + bool find_highest, uint64_t *result_keys) { int r = 0, count = 0, level; struct ro_spine spine; init_ro_spine(&spine, info); for (level = 0; level < info->levels; level++) { - r = find_highest_key(&spine, root, result_keys + level, - level == info->levels - 1 ? NULL : &root); + r = find_key(&spine, root, find_highest, result_keys + level, + level == info->levels - 1 ? NULL : &root); if (r == -ENODATA) { r = 0; break; @@ -822,8 +826,23 @@ int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root, return r ? r : count; } + +int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root, + uint64_t *result_keys) +{ + return dm_btree_find_key(info, root, true, result_keys); +} EXPORT_SYMBOL_GPL(dm_btree_find_highest_key); +int dm_btree_find_lowest_key(struct dm_btree_info *info, dm_block_t root, + uint64_t *result_keys) +{ + return dm_btree_find_key(info, root, false, result_keys); +} +EXPORT_SYMBOL_GPL(dm_btree_find_lowest_key); + +/*----------------------------------------------------------------*/ + /* * FIXME: We shouldn't use a recursive algorithm when we have limited stack * space. Also this only works for single level trees. 
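/*
 * Illustrative aside, not part of the patch: the intended calling
 * convention for the new helper (assumes dm-btree.h and linux/errno.h).
 * result_keys needs one slot per btree level; a single-level tree is
 * assumed here.  The return value is the number of slots filled, 0 for
 * an empty tree (which has no lowest key), or < 0 on metadata error.
 */
static int example_lowest_key(struct dm_btree_info *info, dm_block_t root,
                              uint64_t *result)
{
        uint64_t key;
        int r = dm_btree_find_lowest_key(info, root, &key);

        if (r < 0)
                return r;               /* failed to walk the btree */
        if (!r)
                return -ENODATA;        /* empty tree */

        *result = key;
        return 0;
}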
diff --git a/drivers/md/persistent-data/dm-btree.h b/drivers/md/persistent-data/dm-btree.h index 8672d159e0b5..dacfc34180b4 100644 --- a/drivers/md/persistent-data/dm-btree.h +++ b/drivers/md/persistent-data/dm-btree.h @@ -134,6 +134,14 @@ int dm_btree_insert_notify(struct dm_btree_info *info, dm_block_t root, int dm_btree_remove(struct dm_btree_info *info, dm_block_t root, uint64_t *keys, dm_block_t *new_root); +/* + * Returns < 0 on failure. Otherwise the number of key entries that have + * been filled out. Remember trees can have zero entries, and as such have + * no lowest key. + */ +int dm_btree_find_lowest_key(struct dm_btree_info *info, dm_block_t root, + uint64_t *result_keys); + /* * Returns < 0 on failure. Otherwise the number of key entries that have * been filled out. Remember trees can have zero entries, and as such have From 6a388618f120cdc70cd6b6dbcab5f7a4aff500f6 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 9 Jan 2014 16:04:12 -0500 Subject: [PATCH 26/35] dm cache: add block sizes and total cache blocks to status output Improve cache_status to emit: <#used metadata blocks>/<#total metadata blocks> <#used cache blocks>/<#total cache blocks> ... Adding the block sizes allows for easier calculation of the overall size of both the metadata and cache devices. Adding <#total cache blocks> provides useful context for how much of the cache is used. Unfortunately these additions to the status will require updates to users' scripts that monitor the cache status. But these changes help provide more comprehensive information about the cache device and will simplify tools that are being developed to manage dm-cache devices -- because they won't need to issue 3 operations to cobble together the information that we can easily provide via a single status ioctl. While updating the status documentation in cache.txt spaces were tabify'd. Requested-by: Jonathan Brassow Signed-off-by: Mike Snitzer Acked-by: Joe Thornber --- Documentation/device-mapper/cache.txt | 48 +++++++++++++++------------ drivers/md/dm-cache-target.c | 16 +++++---- 2 files changed, 37 insertions(+), 27 deletions(-) diff --git a/Documentation/device-mapper/cache.txt b/Documentation/device-mapper/cache.txt index 719320b5ed3f..63fd7cfa4cf1 100644 --- a/Documentation/device-mapper/cache.txt +++ b/Documentation/device-mapper/cache.txt @@ -217,36 +217,42 @@ the characteristics of a specific policy, always request it by name. 
Status ------ -<#used metadata blocks>/<#total metadata blocks> <#read hits> <#read misses> -<#write hits> <#write misses> <#demotions> <#promotions> <#blocks in cache> -<#dirty> <#features> * <#core args> * <#policy args> -* + <#used metadata blocks>/<#total metadata blocks> + <#used cache blocks>/<#total cache blocks> +<#read hits> <#read misses> <#write hits> <#write misses> +<#demotions> <#promotions> <#dirty> <#features> * +<#core args> * <#policy args> * -#used metadata blocks : Number of metadata blocks used -#total metadata blocks : Total number of metadata blocks -#read hits : Number of times a READ bio has been mapped +metadata block size : Fixed block size for each metadata block in + sectors +#used metadata blocks : Number of metadata blocks used +#total metadata blocks : Total number of metadata blocks +cache block size : Configurable block size for the cache device + in sectors +#used cache blocks : Number of blocks resident in the cache +#total cache blocks : Total number of cache blocks +#read hits : Number of times a READ bio has been mapped to the cache -#read misses : Number of times a READ bio has been mapped +#read misses : Number of times a READ bio has been mapped to the origin -#write hits : Number of times a WRITE bio has been mapped +#write hits : Number of times a WRITE bio has been mapped to the cache -#write misses : Number of times a WRITE bio has been +#write misses : Number of times a WRITE bio has been mapped to the origin -#demotions : Number of times a block has been removed +#demotions : Number of times a block has been removed from the cache -#promotions : Number of times a block has been moved to +#promotions : Number of times a block has been moved to the cache -#blocks in cache : Number of blocks resident in the cache -#dirty : Number of blocks in the cache that differ +#dirty : Number of blocks in the cache that differ from the origin -#feature args : Number of feature args to follow -feature args : 'writethrough' (optional) -#core args : Number of core arguments (must be even) -core args : Key/value pairs for tuning the core +#feature args : Number of feature args to follow +feature args : 'writethrough' (optional) +#core args : Number of core arguments (must be even) +core args : Key/value pairs for tuning the core e.g. migration_threshold -#policy args : Number of policy arguments to follow (must be even) -policy args : Key/value pairs - e.g. 'sequential_threshold 1024 +#policy args : Number of policy arguments to follow (must be even) +policy args : Key/value pairs + e.g. 
sequential_threshold Messages -------- diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 1b1469ebe5cb..11ad70540d40 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -2826,9 +2826,10 @@ static void cache_resume(struct dm_target *ti) /* * Status format: * - * <#used metadata blocks>/<#total metadata blocks> + * <#used metadata blocks>/<#total metadata blocks> + * <#used cache blocks>/<#total cache blocks> * <#read hits> <#read misses> <#write hits> <#write misses> - * <#demotions> <#promotions> <#blocks in cache> <#dirty> + * <#demotions> <#promotions> <#dirty> * <#features> * * <#core args> * <#policy args> * @@ -2869,17 +2870,20 @@ static void cache_status(struct dm_target *ti, status_type_t type, residency = policy_residency(cache->policy); - DMEMIT("%llu/%llu %u %u %u %u %u %u %llu %u ", + DMEMIT("%u %llu/%llu %u %llu/%llu %u %u %u %u %u %u %llu ", + (unsigned)(DM_CACHE_METADATA_BLOCK_SIZE >> SECTOR_SHIFT), (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata), (unsigned long long)nr_blocks_metadata, + cache->sectors_per_block, + (unsigned long long) from_cblock(residency), + (unsigned long long) from_cblock(cache->cache_size), (unsigned) atomic_read(&cache->stats.read_hit), (unsigned) atomic_read(&cache->stats.read_miss), (unsigned) atomic_read(&cache->stats.write_hit), (unsigned) atomic_read(&cache->stats.write_miss), (unsigned) atomic_read(&cache->stats.demotion), (unsigned) atomic_read(&cache->stats.promotion), - (unsigned long long) from_cblock(residency), - cache->nr_dirty); + (unsigned long long) from_cblock(cache->nr_dirty)); if (writethrough_mode(&cache->features)) DMEMIT("1 writethrough "); @@ -3129,7 +3133,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits) static struct target_type cache_target = { .name = "cache", - .version = {1, 2, 0}, + .version = {1, 3, 0}, .module = THIS_MODULE, .ctr = cache_ctr, .dtr = cache_dtr, From 119bc547362e5252074f81f56b8fcdac45cedff4 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 13 Jan 2014 19:13:36 -0500 Subject: [PATCH 27/35] dm snapshot: use GFP_KERNEL when initializing exceptions The list of initial exceptions is loaded in the target constructor. We are allowed to allocate memory with GFP_KERNEL at this point. So, change alloc_completed_exception to use GFP_KERNEL when being called from the constructor. 
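For context, here is a condensed sketch of the allocation policy this change settles on (the call sites named in the comments refer to the diff below; exception_cache is dm-snap.c's kmem cache): constructor-context callers may sleep and perform I/O, so they pass GFP_KERNEL, while I/O-completion callers pass GFP_NOIO so the allocation cannot recurse into block I/O, and only they keep the GFP_ATOMIC emergency fallback.

#include <linux/slab.h>

struct dm_exception;
static struct kmem_cache *exception_cache;

static struct dm_exception *alloc_completed_exception(gfp_t gfp)
{
        struct dm_exception *e = kmem_cache_alloc(exception_cache, gfp);

        /* Emergency fallback, kept only for I/O-path (GFP_NOIO) callers. */
        if (!e && gfp == GFP_NOIO)
                e = kmem_cache_alloc(exception_cache, GFP_ATOMIC);

        return e;
}

/* ctr path (dm_add_exception): sleeping and I/O are allowed. */
static struct dm_exception *alloc_for_initial_load(void)
{
        return alloc_completed_exception(GFP_KERNEL);
}

/* I/O completion path (pending_complete): must not issue I/O. */
static struct dm_exception *alloc_for_io_path(void)
{
        return alloc_completed_exception(GFP_NOIO);
}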
Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-snap.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 944690bafd93..717718558bd9 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -610,12 +610,12 @@ static struct dm_exception *dm_lookup_exception(struct dm_exception_table *et, return NULL; } -static struct dm_exception *alloc_completed_exception(void) +static struct dm_exception *alloc_completed_exception(gfp_t gfp) { struct dm_exception *e; - e = kmem_cache_alloc(exception_cache, GFP_NOIO); - if (!e) + e = kmem_cache_alloc(exception_cache, gfp); + if (!e && gfp == GFP_NOIO) e = kmem_cache_alloc(exception_cache, GFP_ATOMIC); return e; @@ -697,7 +697,7 @@ static int dm_add_exception(void *context, chunk_t old, chunk_t new) struct dm_snapshot *s = context; struct dm_exception *e; - e = alloc_completed_exception(); + e = alloc_completed_exception(GFP_KERNEL); if (!e) return -ENOMEM; @@ -1405,7 +1405,7 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success) goto out; } - e = alloc_completed_exception(); + e = alloc_completed_exception(GFP_NOIO); if (!e) { down_write(&s->lock); __invalidate_snapshot(s, -ENOMEM); From 2cadabd512acca99e6553d303eaedc97a3178a4d Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 13 Jan 2014 19:14:04 -0500 Subject: [PATCH 28/35] dm snapshot: prepare for switch to using dm-bufio Change the functions get_exception, read_exception and insert_exceptions so that ps->area is passed as an argument. This patch doesn't change any functionality, but it refactors the code to allow for a cleaner switch over to using dm-bufio. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-snap-persistent.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c index 2f5a9f83fc8d..ba792ae068b7 100644 --- a/drivers/md/dm-snap-persistent.c +++ b/drivers/md/dm-snap-persistent.c @@ -402,17 +402,18 @@ static int write_header(struct pstore *ps) /* * Access functions for the disk exceptions, these do the endian conversions. 
*/ -static struct disk_exception *get_exception(struct pstore *ps, uint32_t index) +static struct disk_exception *get_exception(struct pstore *ps, void *ps_area, + uint32_t index) { BUG_ON(index >= ps->exceptions_per_area); - return ((struct disk_exception *) ps->area) + index; + return ((struct disk_exception *) ps_area) + index; } -static void read_exception(struct pstore *ps, +static void read_exception(struct pstore *ps, void *ps_area, uint32_t index, struct core_exception *result) { - struct disk_exception *de = get_exception(ps, index); + struct disk_exception *de = get_exception(ps, ps_area, index); /* copy it */ result->old_chunk = le64_to_cpu(de->old_chunk); @@ -422,7 +423,7 @@ static void read_exception(struct pstore *ps, static void write_exception(struct pstore *ps, uint32_t index, struct core_exception *e) { - struct disk_exception *de = get_exception(ps, index); + struct disk_exception *de = get_exception(ps, ps->area, index); /* copy it */ de->old_chunk = cpu_to_le64(e->old_chunk); @@ -431,7 +432,7 @@ static void write_exception(struct pstore *ps, static void clear_exception(struct pstore *ps, uint32_t index) { - struct disk_exception *de = get_exception(ps, index); + struct disk_exception *de = get_exception(ps, ps->area, index); /* clear it */ de->old_chunk = 0; @@ -443,7 +444,7 @@ static void clear_exception(struct pstore *ps, uint32_t index) * 'full' is filled in to indicate if the area has been * filled. */ -static int insert_exceptions(struct pstore *ps, +static int insert_exceptions(struct pstore *ps, void *ps_area, int (*callback)(void *callback_context, chunk_t old, chunk_t new), void *callback_context, @@ -457,7 +458,7 @@ static int insert_exceptions(struct pstore *ps, *full = 1; for (i = 0; i < ps->exceptions_per_area; i++) { - read_exception(ps, i, &e); + read_exception(ps, ps_area, i, &e); /* * If the new_chunk is pointing at the start of @@ -504,7 +505,8 @@ static int read_exceptions(struct pstore *ps, if (r) return r; - r = insert_exceptions(ps, callback, callback_context, &full); + r = insert_exceptions(ps, ps->area, callback, callback_context, + &full); if (r) return r; } @@ -734,7 +736,7 @@ static int persistent_prepare_merge(struct dm_exception_store *store, ps->current_committed = ps->exceptions_per_area; } - read_exception(ps, ps->current_committed - 1, &ce); + read_exception(ps, ps->area, ps->current_committed - 1, &ce); *last_old_chunk = ce.old_chunk; *last_new_chunk = ce.new_chunk; @@ -744,8 +746,8 @@ static int persistent_prepare_merge(struct dm_exception_store *store, */ for (nr_consecutive = 1; nr_consecutive < ps->current_committed; nr_consecutive++) { - read_exception(ps, ps->current_committed - 1 - nr_consecutive, - &ce); + read_exception(ps, ps->area, + ps->current_committed - 1 - nr_consecutive, &ce); if (ce.old_chunk != *last_old_chunk - nr_consecutive || ce.new_chunk != *last_new_chunk - nr_consecutive) break; From 55494bf2947dccdf2d98b62374fea7365dfead84 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 13 Jan 2014 19:12:36 -0500 Subject: [PATCH 29/35] dm snapshot: use dm-bufio Use dm-bufio for initial loading of the exceptions. Introduce a new function dm_bufio_forget that frees the given buffer. 
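For context, a minimal sketch of the read-once bufio pattern the conversion below is built on; read_one_area(), bdev and chunk_size_bytes are illustrative placeholders, while the dm_bufio_* calls are the ones used (or, in the case of dm_bufio_forget, introduced) by this patch. Note that dm_bufio_forget() is only a hint: a buffer that is still held or dirty is left alone.

#include <linux/err.h>
#include "dm-bufio.h"

static int read_one_area(struct block_device *bdev, unsigned chunk_size_bytes,
                         sector_t chunk)
{
        struct dm_bufio_client *client;
        struct dm_buffer *bp;
        void *area;
        int r = 0;

        client = dm_bufio_client_create(bdev, chunk_size_bytes, 1, 0, NULL, NULL);
        if (IS_ERR(client))
                return PTR_ERR(client);

        area = dm_bufio_read(client, chunk, &bp);       /* read and cache the chunk */
        if (IS_ERR(area)) {
                r = PTR_ERR(area);
                goto out;
        }

        /* ... decode the on-disk exceptions held in 'area' ... */

        dm_bufio_release(bp);
        dm_bufio_forget(client, chunk); /* we will never read this chunk again */
out:
        dm_bufio_client_destroy(client);
        return r;
}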
Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/Kconfig | 1 + drivers/md/dm-bufio.c | 22 +++++++++++++++++++ drivers/md/dm-bufio.h | 7 ++++++ drivers/md/dm-snap-persistent.c | 39 +++++++++++++++++++++++++++------ 4 files changed, 62 insertions(+), 7 deletions(-) diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 7441344bd214..39b540a13369 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -238,6 +238,7 @@ config DM_CRYPT config DM_SNAPSHOT tristate "Snapshot target" depends on BLK_DEV_DM + select DM_BUFIO ---help--- Allow volume managers to take writable snapshots of a device. diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 54bdd923316f..d86593721915 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -1350,6 +1350,28 @@ retry: } EXPORT_SYMBOL_GPL(dm_bufio_release_move); +/* + * Free the given buffer. + * + * This is just a hint, if the buffer is in use or dirty, this function + * does nothing. + */ +void dm_bufio_forget(struct dm_bufio_client *c, sector_t block) +{ + struct dm_buffer *b; + + dm_bufio_lock(c); + + b = __find(c, block); + if (b && likely(!b->hold_count) && likely(!b->state)) { + __unlink_buffer(b); + __free_buffer_wake(b); + } + + dm_bufio_unlock(c); +} +EXPORT_SYMBOL(dm_bufio_forget); + unsigned dm_bufio_get_block_size(struct dm_bufio_client *c) { return c->block_size; diff --git a/drivers/md/dm-bufio.h b/drivers/md/dm-bufio.h index b142946a9e32..3dac37627ba4 100644 --- a/drivers/md/dm-bufio.h +++ b/drivers/md/dm-bufio.h @@ -108,6 +108,13 @@ int dm_bufio_issue_flush(struct dm_bufio_client *c); */ void dm_bufio_release_move(struct dm_buffer *b, sector_t new_block); +/* + * Free the given buffer. + * This is just a hint, if the buffer is in use or dirty, this function + * does nothing. + */ +void dm_bufio_forget(struct dm_bufio_client *c, sector_t block); + unsigned dm_bufio_get_block_size(struct dm_bufio_client *c); sector_t dm_bufio_get_device_size(struct dm_bufio_client *c); sector_t dm_bufio_get_block_number(struct dm_buffer *b); diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c index ba792ae068b7..169275050c0b 100644 --- a/drivers/md/dm-snap-persistent.c +++ b/drivers/md/dm-snap-persistent.c @@ -13,6 +13,7 @@ #include #include #include +#include "dm-bufio.h" #define DM_MSG_PREFIX "persistent snapshot" #define DM_CHUNK_SIZE_DEFAULT_SECTORS 32 /* 16KB */ @@ -495,27 +496,51 @@ static int read_exceptions(struct pstore *ps, void *callback_context) { int r, full = 1; + struct dm_bufio_client *client; + + client = dm_bufio_client_create(dm_snap_cow(ps->store->snap)->bdev, + ps->store->chunk_size << SECTOR_SHIFT, + 1, 0, NULL, NULL); + + if (IS_ERR(client)) + return PTR_ERR(client); /* * Keeping reading chunks and inserting exceptions until * we find a partially full area. 
*/ for (ps->current_area = 0; full; ps->current_area++) { - r = area_io(ps, READ); - if (r) - return r; + struct dm_buffer *bp; + void *area; + chunk_t chunk = area_location(ps, ps->current_area); - r = insert_exceptions(ps, ps->area, callback, callback_context, + area = dm_bufio_read(client, chunk, &bp); + if (unlikely(IS_ERR(area))) { + r = PTR_ERR(area); + goto ret_destroy_bufio; + } + + r = insert_exceptions(ps, area, callback, callback_context, &full); - if (r) - return r; + + dm_bufio_release(bp); + + dm_bufio_forget(client, chunk); + + if (unlikely(r)) + goto ret_destroy_bufio; } ps->current_area--; skip_metadata(ps); - return 0; + r = 0; + +ret_destroy_bufio: + dm_bufio_client_destroy(client); + + return r; } static struct pstore *get_info(struct dm_exception_store *store) From 55b082e614e219fb5199a6f93e648ed35d3c96d5 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 13 Jan 2014 19:13:05 -0500 Subject: [PATCH 30/35] dm snapshot: use dm-bufio prefetch This patch modifies dm-snapshot so that it prefetches the buffers when loading the exceptions. The number of buffers read ahead is specified in the DM_PREFETCH_CHUNKS macro. The current value for DM_PREFETCH_CHUNKS (12) was found to provide the best performance on a single 15k SCSI spindle. In the future we may modify this default or make it configurable. Also, introduce the function dm_bufio_set_minimum_buffers to setup bufio's number of internal buffers before freeing happens. dm-bufio may hold more buffers if enough memory is available. There is no guarantee that the specified number of buffers will be available - if you need a guarantee, use the argument reserved_buffers for dm_bufio_client_create. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-bufio.c | 14 ++++++++++++-- drivers/md/dm-bufio.h | 5 +++++ drivers/md/dm-snap-persistent.c | 25 ++++++++++++++++++++++++- 3 files changed, 41 insertions(+), 3 deletions(-) diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index d86593721915..9ed42125514b 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -104,6 +104,8 @@ struct dm_bufio_client { struct list_head reserved_buffers; unsigned need_reserved_buffers; + unsigned minimum_buffers; + struct hlist_head *cache_hash; wait_queue_head_t free_buffer_wait; @@ -861,8 +863,8 @@ static void __get_memory_limit(struct dm_bufio_client *c, buffers = dm_bufio_cache_size_per_client >> (c->sectors_per_block_bits + SECTOR_SHIFT); - if (buffers < DM_BUFIO_MIN_BUFFERS) - buffers = DM_BUFIO_MIN_BUFFERS; + if (buffers < c->minimum_buffers) + buffers = c->minimum_buffers; *limit_buffers = buffers; *threshold_buffers = buffers * DM_BUFIO_WRITEBACK_PERCENT / 100; @@ -1372,6 +1374,12 @@ void dm_bufio_forget(struct dm_bufio_client *c, sector_t block) } EXPORT_SYMBOL(dm_bufio_forget); +void dm_bufio_set_minimum_buffers(struct dm_bufio_client *c, unsigned n) +{ + c->minimum_buffers = n; +} +EXPORT_SYMBOL(dm_bufio_set_minimum_buffers); + unsigned dm_bufio_get_block_size(struct dm_bufio_client *c) { return c->block_size; @@ -1568,6 +1576,8 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign INIT_LIST_HEAD(&c->reserved_buffers); c->need_reserved_buffers = reserved_buffers; + c->minimum_buffers = DM_BUFIO_MIN_BUFFERS; + init_waitqueue_head(&c->free_buffer_wait); c->async_write_error = 0; diff --git a/drivers/md/dm-bufio.h b/drivers/md/dm-bufio.h index 3dac37627ba4..c096779a7292 100644 --- a/drivers/md/dm-bufio.h +++ b/drivers/md/dm-bufio.h @@ -115,6 +115,11 @@ void 
dm_bufio_release_move(struct dm_buffer *b, sector_t new_block); */ void dm_bufio_forget(struct dm_bufio_client *c, sector_t block); +/* + * Set the minimum number of buffers before cleanup happens. + */ +void dm_bufio_set_minimum_buffers(struct dm_bufio_client *c, unsigned n); + unsigned dm_bufio_get_block_size(struct dm_bufio_client *c); sector_t dm_bufio_get_device_size(struct dm_bufio_client *c); sector_t dm_bufio_get_block_number(struct dm_buffer *b); diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c index 169275050c0b..afc3d017de4c 100644 --- a/drivers/md/dm-snap-persistent.c +++ b/drivers/md/dm-snap-persistent.c @@ -18,6 +18,8 @@ #define DM_MSG_PREFIX "persistent snapshot" #define DM_CHUNK_SIZE_DEFAULT_SECTORS 32 /* 16KB */ +#define DM_PREFETCH_CHUNKS 12 + /*----------------------------------------------------------------- * Persistent snapshots, by persistent we mean that the snapshot * will survive a reboot. @@ -497,6 +499,7 @@ static int read_exceptions(struct pstore *ps, { int r, full = 1; struct dm_bufio_client *client; + chunk_t prefetch_area = 0; client = dm_bufio_client_create(dm_snap_cow(ps->store->snap)->bdev, ps->store->chunk_size << SECTOR_SHIFT, @@ -505,6 +508,11 @@ static int read_exceptions(struct pstore *ps, if (IS_ERR(client)) return PTR_ERR(client); + /* + * Setup for one current buffer + desired readahead buffers. + */ + dm_bufio_set_minimum_buffers(client, 1 + DM_PREFETCH_CHUNKS); + /* * Keeping reading chunks and inserting exceptions until * we find a partially full area. @@ -512,7 +520,22 @@ static int read_exceptions(struct pstore *ps, for (ps->current_area = 0; full; ps->current_area++) { struct dm_buffer *bp; void *area; - chunk_t chunk = area_location(ps, ps->current_area); + chunk_t chunk; + + if (unlikely(prefetch_area < ps->current_area)) + prefetch_area = ps->current_area; + + if (DM_PREFETCH_CHUNKS) do { + chunk_t pf_chunk = area_location(ps, prefetch_area); + if (unlikely(pf_chunk >= dm_bufio_get_device_size(client))) + break; + dm_bufio_prefetch(client, pf_chunk, 1); + prefetch_area++; + if (unlikely(!prefetch_area)) + break; + } while (prefetch_area <= ps->current_area + DM_PREFETCH_CHUNKS); + + chunk = area_location(ps, ps->current_area); area = dm_bufio_read(client, chunk, &bp); if (unlikely(IS_ERR(area))) { From 2995fa78e423d7193f3b57835f6c1c75006a0315 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 13 Jan 2014 19:37:54 -0500 Subject: [PATCH 31/35] dm sysfs: fix a module unload race This reverts commit be35f48610 ("dm: wait until embedded kobject is released before destroying a device") and provides an improved fix. The kobject release code that calls the completion must be placed in a non-module file, otherwise there is a module unload race (if the process calling dm_kobject_release is preempted and the DM module unloaded after the completion is triggered, but before dm_kobject_release returns). To fix this race, this patch moves the completion code to dm-builtin.c which is always compiled directly into the kernel if BLK_DEV_DM is selected. The patch introduces a new dm_kobject_holder structure, its purpose is to keep the completion and kobject in one place, so that it can be accessed from non-module code without the need to export the layout of struct mapped_device to that code. 
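For context, a condensed sketch assembling the pieces added below (example_teardown is an illustrative stand-in for dm_sysfs_exit; the rest mirrors the dm.h and dm-builtin.c hunks):

#include <linux/kobject.h>
#include <linux/completion.h>

struct dm_kobject_holder {
        struct kobject kobj;            /* embedded, so release must never free it */
        struct completion completion;   /* signalled when the last reference is dropped */
};

static inline struct completion *dm_get_completion_from_kobject(struct kobject *kobj)
{
        return &container_of(kobj, struct dm_kobject_holder, kobj)->completion;
}

/* Lives in dm-builtin.c, i.e. always built in, so this code cannot be unloaded. */
void dm_kobject_release(struct kobject *kobj)
{
        complete(dm_get_completion_from_kobject(kobj));
}

/* Teardown: drop our reference, then wait until every other holder has too. */
static void example_teardown(struct kobject *kobj)
{
        kobject_put(kobj);
        wait_for_completion(dm_get_completion_from_kobject(kobj));
}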
Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer Cc: stable@vger.kernel.org --- drivers/md/Kconfig | 4 ++++ drivers/md/Makefile | 1 + drivers/md/dm-builtin.c | 48 +++++++++++++++++++++++++++++++++++++++++ drivers/md/dm-sysfs.c | 5 ----- drivers/md/dm.c | 20 +++++------------ drivers/md/dm.h | 17 ++++++++++++++- 6 files changed, 74 insertions(+), 21 deletions(-) create mode 100644 drivers/md/dm-builtin.c diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 39b540a13369..9a06fe883766 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -176,8 +176,12 @@ config MD_FAULTY source "drivers/md/bcache/Kconfig" +config BLK_DEV_DM_BUILTIN + boolean + config BLK_DEV_DM tristate "Device mapper support" + select BLK_DEV_DM_BUILTIN ---help--- Device-mapper is a low level volume manager. It works by allowing people to specify mappings for ranges of logical sectors. Various diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 2acc43fe0229..f26d83292579 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -32,6 +32,7 @@ obj-$(CONFIG_MD_FAULTY) += faulty.o obj-$(CONFIG_BCACHE) += bcache/ obj-$(CONFIG_BLK_DEV_MD) += md-mod.o obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o +obj-$(CONFIG_BLK_DEV_DM_BUILTIN) += dm-builtin.o obj-$(CONFIG_DM_BUFIO) += dm-bufio.o obj-$(CONFIG_DM_BIO_PRISON) += dm-bio-prison.o obj-$(CONFIG_DM_CRYPT) += dm-crypt.o diff --git a/drivers/md/dm-builtin.c b/drivers/md/dm-builtin.c new file mode 100644 index 000000000000..6c9049c51b2b --- /dev/null +++ b/drivers/md/dm-builtin.c @@ -0,0 +1,48 @@ +#include "dm.h" + +/* + * The kobject release method must not be placed in the module itself, + * otherwise we are subject to module unload races. + * + * The release method is called when the last reference to the kobject is + * dropped. It may be called by any other kernel code that drops the last + * reference. + * + * The release method suffers from module unload race. We may prevent the + * module from being unloaded at the start of the release method (using + * increased module reference count or synchronizing against the release + * method), however there is no way to prevent the module from being + * unloaded at the end of the release method. + * + * If this code were placed in the dm module, the following race may + * happen: + * 1. Some other process takes a reference to dm kobject + * 2. The user issues ioctl function to unload the dm device + * 3. dm_sysfs_exit calls kobject_put, however the object is not released + * because of the other reference taken at step 1 + * 4. dm_sysfs_exit waits on the completion + * 5. The other process that took the reference in step 1 drops it, + * dm_kobject_release is called from this process + * 6. dm_kobject_release calls complete() + * 7. a reschedule happens before dm_kobject_release returns + * 8. dm_sysfs_exit continues, the dm device is unloaded, module reference + * count is decremented + * 9. The user unloads the dm module + * 10. The other process that was rescheduled in step 7 continues to run, + * it is now executing code in unloaded module, so it crashes + * + * Note that if the process that takes the foreign reference to dm kobject + * has a low priority and the system is sufficiently loaded with + * higher-priority processes that prevent the low-priority process from + * being scheduled long enough, this bug may really happen. + * + * In order to fix this module unload race, we place the release method + * into a helper code that is compiled directly into the kernel. 
+ */ + +void dm_kobject_release(struct kobject *kobj) +{ + complete(dm_get_completion_from_kobject(kobj)); +} + +EXPORT_SYMBOL(dm_kobject_release); diff --git a/drivers/md/dm-sysfs.c b/drivers/md/dm-sysfs.c index e0cc5d6a9e46..c62c5ab6aed5 100644 --- a/drivers/md/dm-sysfs.c +++ b/drivers/md/dm-sysfs.c @@ -79,11 +79,6 @@ static const struct sysfs_ops dm_sysfs_ops = { .show = dm_attr_show, }; -static void dm_kobject_release(struct kobject *kobj) -{ - complete(dm_get_completion_from_kobject(kobj)); -} - /* * dm kobject is embedded in mapped_device structure * no need to define release function here diff --git a/drivers/md/dm.c b/drivers/md/dm.c index e290e72922a4..b49c76284241 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -200,11 +200,8 @@ struct mapped_device { /* forced geometry settings */ struct hd_geometry geometry; - /* sysfs handle */ - struct kobject kobj; - - /* wait until the kobject is released */ - struct completion kobj_completion; + /* kobject and completion */ + struct dm_kobject_holder kobj_holder; /* zero-length flush that will be cloned and submitted to targets */ struct bio flush_bio; @@ -2044,7 +2041,7 @@ static struct mapped_device *alloc_dev(int minor) init_waitqueue_head(&md->wait); INIT_WORK(&md->work, dm_wq_work); init_waitqueue_head(&md->eventq); - init_completion(&md->kobj_completion); + init_completion(&md->kobj_holder.completion); md->disk->major = _major; md->disk->first_minor = minor; @@ -2906,14 +2903,14 @@ struct gendisk *dm_disk(struct mapped_device *md) struct kobject *dm_kobject(struct mapped_device *md) { - return &md->kobj; + return &md->kobj_holder.kobj; } struct mapped_device *dm_get_from_kobject(struct kobject *kobj) { struct mapped_device *md; - md = container_of(kobj, struct mapped_device, kobj); + md = container_of(kobj, struct mapped_device, kobj_holder.kobj); if (test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) @@ -2923,13 +2920,6 @@ struct mapped_device *dm_get_from_kobject(struct kobject *kobj) return md; } -struct completion *dm_get_completion_from_kobject(struct kobject *kobj) -{ - struct mapped_device *md = container_of(kobj, struct mapped_device, kobj); - - return &md->kobj_completion; -} - int dm_suspended_md(struct mapped_device *md) { return test_bit(DMF_SUSPENDED, &md->flags); diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 1ab2028559ca..c4569f02f50f 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -16,6 +16,7 @@ #include #include #include +#include #include "dm-stats.h" @@ -149,11 +150,25 @@ void dm_interface_exit(void); /* * sysfs interface */ +struct dm_kobject_holder { + struct kobject kobj; + struct completion completion; +}; + +static inline struct completion *dm_get_completion_from_kobject(struct kobject *kobj) +{ + return &container_of(kobj, struct dm_kobject_holder, kobj)->completion; +} + int dm_sysfs_init(struct mapped_device *md); void dm_sysfs_exit(struct mapped_device *md); struct kobject *dm_kobject(struct mapped_device *md); struct mapped_device *dm_get_from_kobject(struct kobject *kobj); -struct completion *dm_get_completion_from_kobject(struct kobject *kobj); + +/* + * The kobject helper + */ +void dm_kobject_release(struct kobject *kobj); /* * Targets for linear and striped mappings From 74aa45c33c5afefa0107c15f2465ff4195c33f96 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Wed, 15 Jan 2014 19:07:58 -0500 Subject: [PATCH 32/35] dm thin: fix pool feature parsing Commit 787a996cb251e20 ("dm thin: add error_if_no_space feature") mistakenly forgot to increase the number of feature args 
supported. Signed-off-by: Mike Snitzer --- drivers/md/dm-thin.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index d2328bb05192..726228b33a01 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -1941,7 +1941,7 @@ static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf, const char *arg_name; static struct dm_arg _args[] = { - {0, 3, "Invalid number of pool feature arguments"}, + {0, 4, "Invalid number of pool feature arguments"}, }; /* From 2e68c4e6caad9fdadc1cef8b6cb9569192e8a42b Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Wed, 15 Jan 2014 21:06:55 -0500 Subject: [PATCH 33/35] dm cache: add policy name to status output The cache's policy may have been established using the "default" alias, which is currently the "mq" policy but the default policy may change in the future. It is useful to know exactly which policy is being used. Add a 'real' member to the dm_cache_policy_type structure and have the "default" dm_cache_policy_type point to the real "mq" dm_cache_policy_type. Update dm_cache_policy_get_name() to check if real is set, if so report the name of the real policy (not the alias). Requested-by: Jonathan Brassow Signed-off-by: Mike Snitzer --- Documentation/device-mapper/cache.txt | 3 ++- drivers/md/dm-cache-policy-mq.c | 3 ++- drivers/md/dm-cache-policy.c | 4 ++++ drivers/md/dm-cache-policy.h | 6 ++++++ drivers/md/dm-cache-target.c | 4 +++- 5 files changed, 17 insertions(+), 3 deletions(-) diff --git a/Documentation/device-mapper/cache.txt b/Documentation/device-mapper/cache.txt index 63fd7cfa4cf1..e6b72d355151 100644 --- a/Documentation/device-mapper/cache.txt +++ b/Documentation/device-mapper/cache.txt @@ -221,7 +221,7 @@ Status <#used cache blocks>/<#total cache blocks> <#read hits> <#read misses> <#write hits> <#write misses> <#demotions> <#promotions> <#dirty> <#features> * -<#core args> * <#policy args> * +<#core args> * <#policy args> * metadata block size : Fixed block size for each metadata block in sectors @@ -250,6 +250,7 @@ feature args : 'writethrough' (optional) #core args : Number of core arguments (must be even) core args : Key/value pairs for tuning the core e.g. migration_threshold +policy name : Name of the policy #policy args : Number of policy arguments to follow (must be even) policy args : Key/value pairs e.g. sequential_threshold diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c index e63e36cefc89..930e8c3d73e9 100644 --- a/drivers/md/dm-cache-policy-mq.c +++ b/drivers/md/dm-cache-policy-mq.c @@ -1276,7 +1276,8 @@ static struct dm_cache_policy_type default_policy_type = { .version = {1, 2, 0}, .hint_size = 4, .owner = THIS_MODULE, - .create = mq_create + .create = mq_create, + .real = &mq_policy_type }; static int __init mq_init(void) diff --git a/drivers/md/dm-cache-policy.c b/drivers/md/dm-cache-policy.c index d80057968407..c1a3cee99b44 100644 --- a/drivers/md/dm-cache-policy.c +++ b/drivers/md/dm-cache-policy.c @@ -146,6 +146,10 @@ const char *dm_cache_policy_get_name(struct dm_cache_policy *p) { struct dm_cache_policy_type *t = p->private; + /* if t->real is set then an alias was used (e.g. 
"default") */ + if (t->real) + return t->real->name; + return t->name; } EXPORT_SYMBOL_GPL(dm_cache_policy_get_name); diff --git a/drivers/md/dm-cache-policy.h b/drivers/md/dm-cache-policy.h index 052c00a84a5c..f50fe360c546 100644 --- a/drivers/md/dm-cache-policy.h +++ b/drivers/md/dm-cache-policy.h @@ -222,6 +222,12 @@ struct dm_cache_policy_type { char name[CACHE_POLICY_NAME_SIZE]; unsigned version[CACHE_POLICY_VERSION_SIZE]; + /* + * For use by an alias dm_cache_policy_type to point to the + * real dm_cache_policy_type. + */ + struct dm_cache_policy_type *real; + /* * Policies may store a hint for each each cache block. * Currently the size of this hint must be 0 or 4 bytes but we diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 11ad70540d40..09334c275c79 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -2832,7 +2832,7 @@ static void cache_resume(struct dm_target *ti) * <#demotions> <#promotions> <#dirty> * <#features> * * <#core args> - * <#policy args> * + * <#policy args> * */ static void cache_status(struct dm_target *ti, status_type_t type, unsigned status_flags, char *result, unsigned maxlen) @@ -2900,6 +2900,8 @@ static void cache_status(struct dm_target *ti, status_type_t type, } DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold); + + DMEMIT("%s ", dm_cache_policy_get_name(cache->policy)); if (sz < maxlen) { r = policy_emit_config_values(cache->policy, result + sz, maxlen - sz); if (r) From fca028438fb903852beaf7c3fe1cd326651af57d Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Tue, 21 Jan 2014 11:07:32 +0000 Subject: [PATCH 34/35] dm space map metadata: fix bug in resizing of thin metadata This bug was introduced in commit 7e664b3dec431e ("dm space map metadata: fix extending the space map"). When extending a dm-thin metadata volume we: - Switch the space map into a simple bootstrap mode, which allocates all space linearly from the newly added space. - Add new bitmap entries for the new space - Increment the reference counts for those newly allocated bitmap entries - Commit changes to disk - Switch back out of bootstrap mode. But, the disk commit may allocate space itself, if so this fact will be lost when switching out of bootstrap mode. The bug exhibited itself as an error when the bitmap_root, with an erroneous ref count of 0, was subsequently decremented as part of a later disk commit. This would cause the disk commit to fail, and thinp to enter read_only mode. The metadata was not damaged (thin_check passed). The fix is to put the increments + commit into a loop, running until the commit has not allocated extra space. In practise this loop only runs twice. 
With this fix the following device mapper testsuite test passes: dmtest run --suite thin-provisioning -n thin_remove_works_after_resize Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer Cc: stable@vger.kernel.org # depends on commit 7e664b3dec431e --- .../md/persistent-data/dm-space-map-metadata.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c index bfbfe03228c1..536782e3bcb7 100644 --- a/drivers/md/persistent-data/dm-space-map-metadata.c +++ b/drivers/md/persistent-data/dm-space-map-metadata.c @@ -617,13 +617,23 @@ static int sm_metadata_extend(struct dm_space_map *sm, dm_block_t extra_blocks) if (r) goto out; - for (i = old_len; !r && i < smm->begin; i++) { - r = sm_ll_inc(&smm->ll, i, &ev); + /* + * We repeatedly increment then commit until the commit doesn't + * allocate any new blocks. + */ + do { + for (i = old_len; !r && i < smm->begin; i++) { + r = sm_ll_inc(&smm->ll, i, &ev); + if (r) + goto out; + } + old_len = smm->begin; + + r = sm_ll_commit(&smm->ll); if (r) goto out; - } - r = sm_metadata_commit(sm); + } while (old_len != smm->begin); out: /* From 5066a4df1f427faac8372d20494483bb09a4a1cd Mon Sep 17 00:00:00 2001 From: Dongmao Zhang Date: Wed, 15 Jan 2014 15:44:37 -0600 Subject: [PATCH 35/35] dm log userspace: allow mark requests to piggyback on flush requests In the cluster evironment, cluster write has poor performance because userspace_flush() has to contact a userspace program (cmirrord) for clear/mark/flush requests. But both mark and flush requests require cmirrord to communicate the message to all the cluster nodes for each flush call. This behaviour is really slow. To address this we now merge mark and flush requests together to reduce the kernel-userspace-kernel time. We allow a new directive, "integrated_flush" that can be used to instruct the kernel log code to combine flush and mark requests when directed by userspace. If not directed by userspace (due to an older version of the userspace code perhaps), the kernel will function as it did previously - preserving backwards compatibility. Additionally, flush requests are performed lazily when only clear requests exist. Signed-off-by: Dongmao Zhang Signed-off-by: Jonathan Brassow Signed-off-by: Alasdair G Kergon Signed-off-by: Mike Snitzer --- drivers/md/dm-log-userspace-base.c | 210 ++++++++++++++++++++------ include/uapi/linux/dm-log-userspace.h | 20 ++- 2 files changed, 178 insertions(+), 52 deletions(-) diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c index 9429159d9ee3..b953db6cc229 100644 --- a/drivers/md/dm-log-userspace-base.c +++ b/drivers/md/dm-log-userspace-base.c @@ -10,10 +10,11 @@ #include #include #include +#include #include "dm-log-userspace-transfer.h" -#define DM_LOG_USERSPACE_VSN "1.1.0" +#define DM_LOG_USERSPACE_VSN "1.3.0" struct flush_entry { int type; @@ -58,6 +59,18 @@ struct log_c { spinlock_t flush_lock; struct list_head mark_list; struct list_head clear_list; + + /* + * Workqueue for flush of clear region requests. + */ + struct workqueue_struct *dmlog_wq; + struct delayed_work flush_log_work; + atomic_t sched_flush; + + /* + * Combine userspace flush and mark requests for efficiency. + */ + uint32_t integrated_flush; }; static mempool_t *flush_entry_pool; @@ -122,6 +135,9 @@ static int build_constructor_string(struct dm_target *ti, *ctr_str = NULL; + /* + * Determine overall size of the string. 
+ */ for (i = 0, str_size = 0; i < argc; i++) str_size += strlen(argv[i]) + 1; /* +1 for space between args */ @@ -141,18 +157,39 @@ static int build_constructor_string(struct dm_target *ti, return str_size; } +static void do_flush(struct work_struct *work) +{ + int r; + struct log_c *lc = container_of(work, struct log_c, flush_log_work.work); + + atomic_set(&lc->sched_flush, 0); + + r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH, NULL, 0, NULL, NULL); + + if (r) + dm_table_event(lc->ti->table); +} + /* * userspace_ctr * * argv contains: - * - * Where 'other args' is the userspace implementation specific log - * arguments. An example might be: - * clustered-disk [[no]sync] + * [integrated_flush] + * Where 'other args' are the userspace implementation-specific log + * arguments. * - * So, this module will strip off the for identification purposes - * when communicating with userspace about a log; but will pass on everything - * else. + * Example: + * [integrated_flush] clustered-disk + * [[no]sync] + * + * This module strips off the and uses it for identification + * purposes when communicating with userspace about a log. + * + * If integrated_flush is defined, the kernel combines flush + * and mark requests. + * + * The rest of the line, beginning with 'clustered-disk', is passed + * to the userspace ctr function. */ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti, unsigned argc, char **argv) @@ -188,12 +225,22 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti, return -EINVAL; } + lc->usr_argc = argc; + strncpy(lc->uuid, argv[0], DM_UUID_LEN); + argc--; + argv++; spin_lock_init(&lc->flush_lock); INIT_LIST_HEAD(&lc->mark_list); INIT_LIST_HEAD(&lc->clear_list); - str_size = build_constructor_string(ti, argc - 1, argv + 1, &ctr_str); + if (!strcasecmp(argv[0], "integrated_flush")) { + lc->integrated_flush = 1; + argc--; + argv++; + } + + str_size = build_constructor_string(ti, argc, argv, &ctr_str); if (str_size < 0) { kfree(lc); return str_size; @@ -246,6 +293,19 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti, DMERR("Failed to register %s with device-mapper", devices_rdata); } + + if (lc->integrated_flush) { + lc->dmlog_wq = alloc_workqueue("dmlogd", WQ_MEM_RECLAIM, 0); + if (!lc->dmlog_wq) { + DMERR("couldn't start dmlogd"); + r = -ENOMEM; + goto out; + } + + INIT_DELAYED_WORK(&lc->flush_log_work, do_flush); + atomic_set(&lc->sched_flush, 0); + } + out: kfree(devices_rdata); if (r) { @@ -253,7 +313,6 @@ out: kfree(ctr_str); } else { lc->usr_argv_str = ctr_str; - lc->usr_argc = argc; log->context = lc; } @@ -264,9 +323,16 @@ static void userspace_dtr(struct dm_dirty_log *log) { struct log_c *lc = log->context; + if (lc->integrated_flush) { + /* flush workqueue */ + if (atomic_read(&lc->sched_flush)) + flush_delayed_work(&lc->flush_log_work); + + destroy_workqueue(lc->dmlog_wq); + } + (void) dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_DTR, - NULL, 0, - NULL, NULL); + NULL, 0, NULL, NULL); if (lc->log_dev) dm_put_device(lc->ti, lc->log_dev); @@ -283,8 +349,7 @@ static int userspace_presuspend(struct dm_dirty_log *log) struct log_c *lc = log->context; r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_PRESUSPEND, - NULL, 0, - NULL, NULL); + NULL, 0, NULL, NULL); return r; } @@ -294,9 +359,14 @@ static int userspace_postsuspend(struct dm_dirty_log *log) int r; struct log_c *lc = log->context; + /* + * Run planned flush earlier. 
@@ -294,9 +359,14 @@ static int userspace_postsuspend(struct dm_dirty_log *log)
 	int r;
 	struct log_c *lc = log->context;
 
+	/*
+	 * Run planned flush earlier.
+	 */
+	if (lc->integrated_flush && atomic_read(&lc->sched_flush))
+		flush_delayed_work(&lc->flush_log_work);
+
 	r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_POSTSUSPEND,
-				 NULL, 0,
-				 NULL, NULL);
+				 NULL, 0, NULL, NULL);
 
 	return r;
 }
@@ -308,8 +378,7 @@ static int userspace_resume(struct dm_dirty_log *log)
 
 	lc->in_sync_hint = 0;
 	r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_RESUME,
-				 NULL, 0,
-				 NULL, NULL);
+				 NULL, 0, NULL, NULL);
 
 	return r;
 }
@@ -405,7 +474,8 @@ static int flush_one_by_one(struct log_c *lc, struct list_head *flush_list)
 	return r;
 }
 
-static int flush_by_group(struct log_c *lc, struct list_head *flush_list)
+static int flush_by_group(struct log_c *lc, struct list_head *flush_list,
+			  int flush_with_payload)
 {
 	int r = 0;
 	int count;
@@ -431,15 +501,29 @@ static int flush_by_group(struct log_c *lc, struct list_head *flush_list)
 			break;
 		}
 
-		r = userspace_do_request(lc, lc->uuid, type,
-					 (char *)(group),
-					 count * sizeof(uint64_t),
-					 NULL, NULL);
-		if (r) {
-			/* Group send failed.  Attempt one-by-one. */
-			list_splice_init(&tmp_list, flush_list);
-			r = flush_one_by_one(lc, flush_list);
-			break;
+		if (flush_with_payload) {
+			r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH,
+						 (char *)(group),
+						 count * sizeof(uint64_t),
+						 NULL, NULL);
+			/*
+			 * Integrated flush failed.
+			 */
+			if (r)
+				break;
+		} else {
+			r = userspace_do_request(lc, lc->uuid, type,
+						 (char *)(group),
+						 count * sizeof(uint64_t),
+						 NULL, NULL);
+			if (r) {
+				/*
+				 * Group send failed.  Attempt one-by-one.
+				 */
+				list_splice_init(&tmp_list, flush_list);
+				r = flush_one_by_one(lc, flush_list);
+				break;
+			}
 		}
 	}
 
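
When flush_by_group() is called with flush_with_payload set, the batch of region numbers that would previously have travelled in a DM_ULOG_MARK_REGION request is shipped as the payload of the DM_ULOG_FLUSH request itself. A schematic of what the receiving side then has to do is sketched below; it is an illustration only -- struct toy_request, mark_region() and commit_log() are hypothetical stand-ins, not struct dm_ulog_request or the cmirrord implementation.

/* Userspace-side sketch of handling an integrated flush (illustrative). */
#include <stdint.h>
#include <string.h>

struct toy_request {
        uint32_t request_type;  /* e.g. the FLUSH request type     */
        uint32_t data_size;     /* bytes of payload that follow    */
        char     data[512];     /* uint64_t region numbers, if any */
};

static void handle_integrated_flush(struct toy_request *rq,
                                    void (*mark_region)(uint64_t region),
                                    int (*commit_log)(void))
{
        uint64_t region;
        uint32_t i, count = rq->data_size / sizeof(uint64_t);

        /* First replay the piggybacked mark requests, if any ... */
        for (i = 0; i < count; i++) {
                memcpy(&region, rq->data + i * sizeof(region), sizeof(region));
                mark_region(region);
        }

        /* ... then flush the log state exactly as a plain flush would. */
        rq->data_size = 0;
        (void)commit_log();
}

Because the kernel only sends the payload when the table asked for integrated_flush, an older daemon that never requests the feature keeps seeing empty flush requests, which is how backwards compatibility is preserved.
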
@@ -476,6 +560,8 @@ static int userspace_flush(struct dm_dirty_log *log)
 	struct log_c *lc = log->context;
 	LIST_HEAD(mark_list);
 	LIST_HEAD(clear_list);
+	int mark_list_is_empty;
+	int clear_list_is_empty;
 	struct flush_entry *fe, *tmp_fe;
 
 	spin_lock_irqsave(&lc->flush_lock, flags);
@@ -483,23 +569,51 @@ static int userspace_flush(struct dm_dirty_log *log)
 	list_splice_init(&lc->clear_list, &clear_list);
 	spin_unlock_irqrestore(&lc->flush_lock, flags);
 
-	if (list_empty(&mark_list) && list_empty(&clear_list))
+	mark_list_is_empty = list_empty(&mark_list);
+	clear_list_is_empty = list_empty(&clear_list);
+
+	if (mark_list_is_empty && clear_list_is_empty)
 		return 0;
 
-	r = flush_by_group(lc, &mark_list);
+	r = flush_by_group(lc, &clear_list, 0);
 	if (r)
-		goto fail;
+		goto out;
 
-	r = flush_by_group(lc, &clear_list);
-	if (r)
-		goto fail;
+	if (!lc->integrated_flush) {
+		r = flush_by_group(lc, &mark_list, 0);
+		if (r)
+			goto out;
+		r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH,
+					 NULL, 0, NULL, NULL);
+		goto out;
+	}
 
-	r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH,
-				 NULL, 0, NULL, NULL);
-
-fail:
 	/*
-	 * We can safely remove these entries, even if failure.
+	 * Send integrated flush request with mark_list as payload.
+	 */
+	r = flush_by_group(lc, &mark_list, 1);
+	if (r)
+		goto out;
+
+	if (mark_list_is_empty && !atomic_read(&lc->sched_flush)) {
+		/*
+		 * When there are only clear region requests,
+		 * we schedule a flush in the future.
+		 */
+		queue_delayed_work(lc->dmlog_wq, &lc->flush_log_work, 3 * HZ);
+		atomic_set(&lc->sched_flush, 1);
+	} else {
+		/*
+		 * Cancel pending flush because we
+		 * have already flushed in mark_region.
+		 */
+		cancel_delayed_work(&lc->flush_log_work);
+		atomic_set(&lc->sched_flush, 0);
+	}
+
+out:
+	/*
+	 * We can safely remove these entries, even after failure.
 	 * Calling code will receive an error and will know that
 	 * the log facility has failed.
 	 */
@@ -603,8 +717,7 @@ static int userspace_get_resync_work(struct dm_dirty_log *log, region_t *region)
 
 	rdata_size = sizeof(pkg);
 	r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_RESYNC_WORK,
-				 NULL, 0,
-				 (char *)&pkg, &rdata_size);
+				 NULL, 0, (char *)&pkg, &rdata_size);
 
 	*region = pkg.r;
 	return (r) ? r : (int)pkg.i;
@@ -630,8 +743,7 @@ static void userspace_set_region_sync(struct dm_dirty_log *log,
 	pkg.i = (int64_t)in_sync;
 
 	r = userspace_do_request(lc, lc->uuid, DM_ULOG_SET_REGION_SYNC,
-				 (char *)&pkg, sizeof(pkg),
-				 NULL, NULL);
+				 (char *)&pkg, sizeof(pkg), NULL, NULL);
 
 	/*
 	 * It would be nice to be able to report failures.
@@ -657,8 +769,7 @@ static region_t userspace_get_sync_count(struct dm_dirty_log *log)
 
 	rdata_size = sizeof(sync_count);
 	r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_SYNC_COUNT,
-				 NULL, 0,
-				 (char *)&sync_count, &rdata_size);
+				 NULL, 0, (char *)&sync_count, &rdata_size);
 
 	if (r)
 		return 0;
@@ -685,8 +796,7 @@ static int userspace_status(struct dm_dirty_log *log, status_type_t status_type,
 	switch (status_type) {
 	case STATUSTYPE_INFO:
 		r = userspace_do_request(lc, lc->uuid, DM_ULOG_STATUS_INFO,
-					 NULL, 0,
-					 result, &sz);
+					 NULL, 0, result, &sz);
 
 		if (r) {
 			sz = 0;
@@ -699,8 +809,10 @@ static int userspace_status(struct dm_dirty_log *log, status_type_t status_type,
 		BUG_ON(!table_args); /* There will always be a ' ' */
 		table_args++;
 
-		DMEMIT("%s %u %s %s ", log->type->name, lc->usr_argc,
-		       lc->uuid, table_args);
+		DMEMIT("%s %u %s ", log->type->name, lc->usr_argc, lc->uuid);
+		if (lc->integrated_flush)
+			DMEMIT("integrated_flush ");
+		DMEMIT("%s ", table_args);
 		break;
 	}
 	return (r) ? 0 : (int)sz;
diff --git a/include/uapi/linux/dm-log-userspace.h b/include/uapi/linux/dm-log-userspace.h
index 0678c2adc421..0fa0d9ef06a5 100644
--- a/include/uapi/linux/dm-log-userspace.h
+++ b/include/uapi/linux/dm-log-userspace.h
@@ -201,11 +201,18 @@
 * int (*flush)(struct dm_dirty_log *log);
 *
 * Payload-to-userspace:
- *	None.
+ *	If the 'integrated_flush' directive is present in the constructor
+ *	table, the payload is the same as for DM_ULOG_MARK_REGION:
+ *	uint64_t [] - region(s) to mark
+ *	else
+ *	None
 * Payload-to-kernel:
 *	None.
 *
- * No incoming or outgoing payload.  Simply flush log state to disk.
+ * If the 'integrated_flush' option was used during the creation of the
+ * log, mark region requests are carried as payload in the flush request.
+ * Piggybacking the mark requests in this way allows for fewer communications
+ * between kernel and userspace.
 *
 * When the request has been processed, user-space must return the
 * dm_ulog_request to the kernel - setting the 'error' field and clearing
@@ -385,8 +392,15 @@
 * version 2:  DM_ULOG_CTR allowed to return a string containing a
 *             device name that is to be registered with DM via
 *             'dm_get_device'.
+ * version 3:  DM_ULOG_FLUSH is capable of carrying payload for marking
+ *             regions.  This "integrated flush" reduces the number of
+ *             requests between the kernel and userspace by effectively
+ *             merging 'mark' and 'flush' requests.  A constructor table
+ *             argument ('integrated_flush') is required to turn this
+ *             feature on, so it is backwards compatible with older
+ *             userspace versions.
 */
-#define DM_ULOG_REQUEST_VERSION 2
+#define DM_ULOG_REQUEST_VERSION 3
 
 struct dm_ulog_request {
 	/*