Merge git://git.kernel.org/pub/scm/linux/kernel/git/agk/linux-2.6-dm

* git://git.kernel.org/pub/scm/linux/kernel/git/agk/linux-2.6-dm:
  dm: tidy local_init
  dm: remove unused flush_all
  dm raid1: separate region_hash interface part1
  dm: mark split bio as cloned
  dm crypt: remove waitqueue
  dm crypt: fix async split
  dm crypt: tidy sector
  dm: remove dm header from targets
  dm: publish array_too_big
  dm exception store: fix misordered writes
  dm exception store: refactor zero_area
  dm snapshot: drop unused last_percent
  dm snapshot: fix primary_pe race
  dm kcopyd: avoid queue shuffle
2008-10-23 09:50:12 -07:00 · 2008-10-23 09:50:12 -07:00 · 3e5cce627c
parent f2e4bd2b37 51157b4ab4
commit 3e5cce627c
21 changed files with 1084 additions and 798 deletions
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@ -34,7 +34,7 @@ obj-$(CONFIG_DM_CRYPT)		+= dm-crypt.o
 obj-$(CONFIG_DM_DELAY)		+= dm-delay.o
 obj-$(CONFIG_DM_MULTIPATH)	+= dm-multipath.o dm-round-robin.o
 obj-$(CONFIG_DM_SNAPSHOT)	+= dm-snapshot.o
-obj-$(CONFIG_DM_MIRROR)		+= dm-mirror.o dm-log.o
+obj-$(CONFIG_DM_MIRROR)		+= dm-mirror.o dm-log.o dm-region-hash.o
 obj-$(CONFIG_DM_ZERO)		+= dm-zero.o

 quiet_cmd_unroll = UNROLL  $@
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@ -23,7 +23,7 @@
 #include <asm/page.h>
 #include <asm/unaligned.h>

-#include "dm.h"
+#include <linux/device-mapper.h>

 #define DM_MSG_PREFIX "crypt"
 #define MESG_STR(x) x, sizeof(x)
@ -56,6 +56,7 @@ struct dm_crypt_io {
 	atomic_t pending;
 	int error;
 	sector_t sector;
+	struct dm_crypt_io *base_io;
 };

 struct dm_crypt_request {
@ -93,7 +94,6 @@ struct crypt_config {

 	struct workqueue_struct *io_queue;
 	struct workqueue_struct *crypt_queue;
-	wait_queue_head_t writeq;

 	/*
 	 * crypto related data
@ -534,6 +534,7 @@ static struct dm_crypt_io *crypt_io_alloc(struct dm_target *ti,
 	io->base_bio = bio;
 	io->sector = sector;
 	io->error = 0;
+	io->base_io = NULL;
 	atomic_set(&io->pending, 0);

 	return io;
@ -547,6 +548,7 @@ static void crypt_inc_pending(struct dm_crypt_io *io)
 /*
 * One of the bios was finished. Check for completion of
 * the whole request and correctly clean up the buffer.
+ * If base_io is set, wait for the last fragment to complete.
 */
 static void crypt_dec_pending(struct dm_crypt_io *io)
 {
@ -555,7 +557,14 @@ static void crypt_dec_pending(struct dm_crypt_io *io)
 	if (!atomic_dec_and_test(&io->pending))
 		return;

-	bio_endio(io->base_bio, io->error);
+	if (likely(!io->base_io))
+		bio_endio(io->base_bio, io->error);
+	else {
+		if (io->error && !io->base_io->error)
+			io->base_io->error = io->error;
+		crypt_dec_pending(io->base_io);
+	}
+
 	mempool_free(io, cc->io_pool);
 }

@ -646,10 +655,7 @@ static void kcryptd_io_read(struct dm_crypt_io *io)
 static void kcryptd_io_write(struct dm_crypt_io *io)
 {
 	struct bio *clone = io->ctx.bio_out;
-	struct crypt_config *cc = io->target->private;
-
 	generic_make_request(clone);
-	wake_up(&cc->writeq);
 }

 static void kcryptd_io(struct work_struct *work)
@ -688,7 +694,6 @@ static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io,
 	BUG_ON(io->ctx.idx_out < clone->bi_vcnt);

 	clone->bi_sector = cc->start + io->sector;
-	io->sector += bio_sectors(clone);

 	if (async)
 		kcryptd_queue_io(io);
@ -700,16 +705,18 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
 {
 	struct crypt_config *cc = io->target->private;
 	struct bio *clone;
+	struct dm_crypt_io *new_io;
 	int crypt_finished;
 	unsigned out_of_pages = 0;
 	unsigned remaining = io->base_bio->bi_size;
+	sector_t sector = io->sector;
 	int r;

 	/*
 	 * Prevent io from disappearing until this function completes.
 	 */
 	crypt_inc_pending(io);
-	crypt_convert_init(cc, &io->ctx, NULL, io->base_bio, io->sector);
+	crypt_convert_init(cc, &io->ctx, NULL, io->base_bio, sector);

 	/*
 	 * The allocated buffers can be smaller than the whole bio,
@ -726,6 +733,7 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
 		io->ctx.idx_out = 0;

 		remaining -= clone->bi_size;
+		sector += bio_sectors(clone);

 		crypt_inc_pending(io);
 		r = crypt_convert(cc, &io->ctx);
@ -741,6 +749,8 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
 			 */
 			if (unlikely(r < 0))
 				break;
+
+			io->sector = sector;
 		}

 		/*
@ -750,8 +760,33 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
 		if (unlikely(out_of_pages))
 			congestion_wait(WRITE, HZ/100);

-		if (unlikely(remaining))
-			wait_event(cc->writeq, !atomic_read(&io->ctx.pending));
+		/*
+		 * With async crypto it is unsafe to share the crypto context
+		 * between fragments, so switch to a new dm_crypt_io structure.
+		 */
+		if (unlikely(!crypt_finished && remaining)) {
+			new_io = crypt_io_alloc(io->target, io->base_bio,
+						sector);
+			crypt_inc_pending(new_io);
+			crypt_convert_init(cc, &new_io->ctx, NULL,
+					   io->base_bio, sector);
+			new_io->ctx.idx_in = io->ctx.idx_in;
+			new_io->ctx.offset_in = io->ctx.offset_in;
+
+			/*
+			 * Fragments after the first use the base_io
+			 * pending count.
+			 */
+			if (!io->base_io)
+				new_io->base_io = io;
+			else {
+				new_io->base_io = io->base_io;
+				crypt_inc_pending(io->base_io);
+				crypt_dec_pending(io);
+			}
+
+			io = new_io;
+		}
 	}

 	crypt_dec_pending(io);
@ -1078,7 +1113,6 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		goto bad_crypt_queue;
 	}

-	init_waitqueue_head(&cc->writeq);
 	ti->private = cc;
 	return 0;

--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@ -13,7 +13,8 @@
 #include <linux/bio.h>
 #include <linux/slab.h>

-#include "dm.h"
+#include <linux/device-mapper.h>
+
 #include "dm-bio-list.h"

 #define DM_MSG_PREFIX "delay"
--- a/drivers/md/dm-exception-store.c
+++ b/drivers/md/dm-exception-store.c
@ -7,7 +7,6 @@
 * This file is released under the GPL.
 */

-#include "dm.h"
 #include "dm-snap.h"

 #include <linux/mm.h>
@ -104,6 +103,11 @@ struct pstore {
 	 */
 	void *area;

+	/*
+	 * An area of zeros used to clear the next area.
+	 */
+	void *zero_area;
+
 	/*
 	 * Used to keep track of which metadata area the data in
 	 * 'chunk' refers to.
@ -149,6 +153,13 @@ static int alloc_area(struct pstore *ps)
 	if (!ps->area)
 		return r;

+	ps->zero_area = vmalloc(len);
+	if (!ps->zero_area) {
+		vfree(ps->area);
+		return r;
+	}
+	memset(ps->zero_area, 0, len);
+
 	return 0;
 }

@ -156,6 +167,8 @@ static void free_area(struct pstore *ps)
 {
 	vfree(ps->area);
 	ps->area = NULL;
+	vfree(ps->zero_area);
+	ps->zero_area = NULL;
 }

 struct mdata_req {
@ -220,25 +233,41 @@ static chunk_t area_location(struct pstore *ps, chunk_t area)
 * Read or write a metadata area.  Remembering to skip the first
 * chunk which holds the header.
 */
-static int area_io(struct pstore *ps, chunk_t area, int rw)
+static int area_io(struct pstore *ps, int rw)
 {
 	int r;
 	chunk_t chunk;

-	chunk = area_location(ps, area);
+	chunk = area_location(ps, ps->current_area);

 	r = chunk_io(ps, chunk, rw, 0);
 	if (r)
 		return r;

-	ps->current_area = area;
 	return 0;
 }

-static int zero_area(struct pstore *ps, chunk_t area)
+static void zero_memory_area(struct pstore *ps)
 {
 	memset(ps->area, 0, ps->snap->chunk_size << SECTOR_SHIFT);
-	return area_io(ps, area, WRITE);
+}
+
+static int zero_disk_area(struct pstore *ps, chunk_t area)
+{
+	struct dm_io_region where = {
+		.bdev = ps->snap->cow->bdev,
+		.sector = ps->snap->chunk_size * area_location(ps, area),
+		.count = ps->snap->chunk_size,
+	};
+	struct dm_io_request io_req = {
+		.bi_rw = WRITE,
+		.mem.type = DM_IO_VMA,
+		.mem.ptr.vma = ps->zero_area,
+		.client = ps->io_client,
+		.notify.fn = NULL,
+	};
+
+	return dm_io(&io_req, 1, &where, NULL);
 }

 static int read_header(struct pstore *ps, int *new_snapshot)
@ -411,15 +440,14 @@ static int insert_exceptions(struct pstore *ps, int *full)

 static int read_exceptions(struct pstore *ps)
 {
-	chunk_t area;
 	int r, full = 1;

 	/*
 	 * Keeping reading chunks and inserting exceptions until
 	 * we find a partially full area.
 	 */
-	for (area = 0; full; area++) {
-		r = area_io(ps, area, READ);
+	for (ps->current_area = 0; full; ps->current_area++) {
+		r = area_io(ps, READ);
 		if (r)
 			return r;

@ -428,6 +456,8 @@ static int read_exceptions(struct pstore *ps)
 			return r;
 	}

+	ps->current_area--;
+
 	return 0;
 }

@ -486,12 +516,13 @@ static int persistent_read_metadata(struct exception_store *store)
 			return r;
 		}

-		r = zero_area(ps, 0);
+		ps->current_area = 0;
+		zero_memory_area(ps);
+		r = zero_disk_area(ps, 0);
 		if (r) {
-			DMWARN("zero_area(0) failed");
+			DMWARN("zero_disk_area(0) failed");
 			return r;
 		}
-
 	} else {
 		/*
 		 * Sanity checks.
@ -551,7 +582,6 @@ static void persistent_commit(struct exception_store *store,
 			      void (*callback) (void *, int success),
 			      void *callback_context)
 {
-	int r;
 	unsigned int i;
 	struct pstore *ps = get_info(store);
 	struct disk_exception de;
@ -572,33 +602,41 @@ static void persistent_commit(struct exception_store *store,
 	cb->context = callback_context;

 	/*
-	 * If there are no more exceptions in flight, or we have
-	 * filled this metadata area we commit the exceptions to
-	 * disk.
+	 * If there are exceptions in flight and we have not yet
+	 * filled this metadata area there's nothing more to do.
 	 */
-	if (atomic_dec_and_test(&ps->pending_count) ||
-	    (ps->current_committed == ps->exceptions_per_area)) {
-		r = area_io(ps, ps->current_area, WRITE);
-		if (r)
-			ps->valid = 0;
+	if (!atomic_dec_and_test(&ps->pending_count) &&
+	    (ps->current_committed != ps->exceptions_per_area))
+		return;

-		/*
-		 * Have we completely filled the current area ?
-		 */
-		if (ps->current_committed == ps->exceptions_per_area) {
-			ps->current_committed = 0;
-			r = zero_area(ps, ps->current_area + 1);
-			if (r)
-				ps->valid = 0;
-		}
+	/*
+	 * If we completely filled the current area, then wipe the next one.
+	 */
+	if ((ps->current_committed == ps->exceptions_per_area) &&
+	     zero_disk_area(ps, ps->current_area + 1))
+		ps->valid = 0;

-		for (i = 0; i < ps->callback_count; i++) {
-			cb = ps->callbacks + i;
-			cb->callback(cb->context, r == 0 ? 1 : 0);
-		}
+	/*
+	 * Commit exceptions to disk.
+	 */
+	if (ps->valid && area_io(ps, WRITE))
+		ps->valid = 0;

-		ps->callback_count = 0;
+	/*
+	 * Advance to the next area if this one is full.
+	 */
+	if (ps->current_committed == ps->exceptions_per_area) {
+		ps->current_committed = 0;
+		ps->current_area++;
+		zero_memory_area(ps);
 	}
+
+	for (i = 0; i < ps->callback_count; i++) {
+		cb = ps->callbacks + i;
+		cb->callback(cb->context, ps->valid);
+	}
+
+	ps->callback_count = 0;
 }

 static void persistent_drop(struct exception_store *store)
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@ -5,7 +5,7 @@
 * This file is released under the GPL.
 */

-#include "dm.h"
+#include <linux/device-mapper.h>

 #include <linux/bio.h>
 #include <linux/mempool.h>
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@ -22,6 +22,7 @@
 #include <linux/vmalloc.h>
 #include <linux/workqueue.h>
 #include <linux/mutex.h>
+#include <linux/device-mapper.h>
 #include <linux/dm-kcopyd.h>

 #include "dm.h"
@ -268,6 +269,17 @@ static void push(struct list_head *jobs, struct kcopyd_job *job)
 	spin_unlock_irqrestore(&kc->job_lock, flags);
 }

+
+/*
+ * Insert a job at the front of a job list (directly after the list
+ * head), holding the owning client's job_lock with interrupt state
+ * saved and restored around the insertion.
+ */
+static void push_head(struct list_head *jobs, struct kcopyd_job *job)
+{
+	struct dm_kcopyd_client *client = job->kc;
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&client->job_lock, irq_flags);
+	list_add(&job->list, jobs);
+	spin_unlock_irqrestore(&client->job_lock, irq_flags);
+}
+
 /*
 * These three functions process 1 item from the corresponding
 * job list.
@ -398,7 +410,7 @@ static int process_jobs(struct list_head *jobs, struct dm_kcopyd_client *kc,
 			 * We couldn't service this job ATM, so
 			 * push this job back onto the list.
 			 */
-			push(jobs, job);
+			push_head(jobs, job);
 			break;
 		}

--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@ -5,12 +5,12 @@
 */

 #include "dm.h"
-
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/blkdev.h>
 #include <linux/bio.h>
 #include <linux/slab.h>
+#include <linux/device-mapper.h>

 #define DM_MSG_PREFIX "linear"

--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@ -12,7 +12,7 @@
 #include <linux/dm-io.h>
 #include <linux/dm-dirty-log.h>

-#include "dm.h"
+#include <linux/device-mapper.h>

 #define DM_MSG_PREFIX "dirty region log"

--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@ -5,7 +5,8 @@
 * This file is released under the GPL.
 */

-#include "dm.h"
+#include <linux/device-mapper.h>
+
 #include "dm-path-selector.h"
 #include "dm-bio-list.h"
 #include "dm-bio-record.h"
--- a/drivers/md/dm-path-selector.c
+++ b/drivers/md/dm-path-selector.c
@ -9,7 +9,8 @@
 * Path selector registration.
 */

-#include "dm.h"
+#include <linux/device-mapper.h>
+
 #include "dm-path-selector.h"

 #include <linux/slab.h>
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
--- a/drivers/md/dm-region-hash.c
+++ b/drivers/md/dm-region-hash.c
@ -0,0 +1,704 @@
+/*
+ * Copyright (C) 2003 Sistina Software Limited.
+ * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
+ *
+ * This file is released under the GPL.
+ */
+
+#include <linux/dm-dirty-log.h>
+#include <linux/dm-region-hash.h>
+
+#include <linux/ctype.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/vmalloc.h>
+
+#include "dm.h"
+#include "dm-bio-list.h"
+
+#define	DM_MSG_PREFIX	"region hash"
+
+/*-----------------------------------------------------------------
+ * Region hash
+ *
+ * The mirror splits itself up into discrete regions.  Each
+ * region can be in one of three states: clean, dirty,
+ * nosync.  There is no need to put clean regions in the hash.
+ *
+ * In addition to being present in the hash table a region _may_
+ * be present on one of three lists.
+ *
+ *   clean_regions: Regions on this list have no io pending to
+ *   them, they are in sync, we are no longer interested in them,
+ *   they are dull.  dm_rh_update_states() will remove them from the
+ *   hash table.
+ *
+ *   quiesced_regions: These regions have been spun down, ready
+ *   for recovery.  dm_rh_recovery_start() will remove regions from
+ *   this list and hand them to kmirrord, which will schedule the
+ *   recovery io with kcopyd.
+ *
+ *   recovered_regions: Regions that kcopyd has successfully
+ *   recovered.  dm_rh_update_states() will now schedule any delayed
+ *   io, up the recovery_count, and remove the region from the
+ *   hash.
+ *
+ * There are 2 locks:
+ *   A rw spin lock 'hash_lock' protects just the hash table,
+ *   this is never held in write mode from interrupt context,
+ *   which I believe means that we only have to disable irqs when
+ *   doing a write lock.
+ *
+ *   An ordinary spin lock 'region_lock' that protects the three
+ *   lists in the region_hash, with the 'state', 'list' and
+ *   'delayed_bios' fields of the regions.  This is used from irq
+ *   context, so all other uses will have to suspend local irqs.
+ *---------------------------------------------------------------*/
+struct dm_region_hash {
+	/* Region size as handed to dm_region_hash_create(). */
+	uint32_t region_size;
+	/* ffs(region_size) - 1; shift count for sector <-> region conversion. */
+	unsigned region_shift;
+
+	/* holds persistent region state */
+	struct dm_dirty_log *log;
+
+	/* hash table */
+	rwlock_t hash_lock;
+	/* Pool that struct dm_region objects are allocated from and freed to. */
+	mempool_t *region_pool;
+	/* nr_buckets - 1; applied by rh_hash() to select a bucket. */
+	unsigned mask;
+	unsigned nr_buckets;
+	/* Multiplier (RH_HASH_MULT) and right shift (RH_HASH_SHIFT) for rh_hash(). */
+	unsigned prime;
+	unsigned shift;
+	/* Array of nr_buckets list heads, linked through dm_region.hash_list. */
+	struct list_head *buckets;
+
+	unsigned max_recovery; /* Max # of regions to recover in parallel */
+
+	spinlock_t region_lock;
+	/*
+	 * Raised in dm_rh_recovery_prepare(), dropped in
+	 * complete_resync_work(); reaching zero wakes the recovery waiters.
+	 */
+	atomic_t recovery_in_flight;
+	/*
+	 * Initialised to 0; dm_rh_start_recovery() ups it max_recovery
+	 * times and dm_rh_stop_recovery() downs it the same number of times.
+	 */
+	struct semaphore recovery_count;
+	struct list_head clean_regions;
+	struct list_head quiesced_regions;
+	struct list_head recovered_regions;
+	/* Regions passed to dm_rh_recovery_end() with success == 0. */
+	struct list_head failed_recovered_regions;
+
+	/* Opaque caller pointer handed back to every callback below. */
+	void *context;
+	/* Subtracted from bio->bi_sector before mapping a bio to a region. */
+	sector_t target_begin;
+
+	/* Callback function to schedule bios writes */
+	void (*dispatch_bios)(void *context, struct bio_list *bios);
+
+	/* Callback function to wakeup callers worker thread. */
+	void (*wakeup_workers)(void *context);
+
+	/* Callback function to wakeup callers recovery waiters. */
+	void (*wakeup_all_recovery_waiters)(void *context);
+};
+
+/*
+ * One entry of the region hash, describing a single region.
+ */
+struct dm_region {
+	struct dm_region_hash *rh;	/* FIXME: can we get rid of this ? */
+	/* Region number; the key compared by __rh_lookup(). */
+	region_t key;
+	/* One of the DM_RH_* state values. */
+	int state;
+
+	/* Linkage in the hash bucket selected by rh_hash(). */
+	struct list_head hash_list;
+	/*
+	 * Linkage on one of the region hash's lists (clean, quiesced,
+	 * recovered, failed recovered); empty when on none of them.
+	 */
+	struct list_head list;
+
+	/* Incremented by rh_inc(), decremented by dm_rh_dec(). */
+	atomic_t pending;
+	/* Bios queued by dm_rh_delay(), dispatched by complete_resync_work(). */
+	struct bio_list delayed_bios;
+};
+
+/*
+ * Conversion fns
+ */
+/* Map a target-relative sector to the number of the region holding it. */
+static region_t dm_rh_sector_to_region(struct dm_region_hash *rh, sector_t sector)
+{
+	region_t region = sector >> rh->region_shift;
+
+	return region;
+}
+
+/* Return the first (target-relative) sector of the given region. */
+sector_t dm_rh_region_to_sector(struct dm_region_hash *rh, region_t region)
+{
+	sector_t first_sector = region << rh->region_shift;
+
+	return first_sector;
+}
+EXPORT_SYMBOL_GPL(dm_rh_region_to_sector);
+
+/*
+ * Return the region a bio starts in: its start sector is first made
+ * relative to the target by subtracting target_begin.
+ */
+region_t dm_rh_bio_to_region(struct dm_region_hash *rh, struct bio *bio)
+{
+	sector_t offset = bio->bi_sector - rh->target_begin;
+
+	return dm_rh_sector_to_region(rh, offset);
+}
+EXPORT_SYMBOL_GPL(dm_rh_bio_to_region);
+
+/* Return the caller context stored in the region hash owning @reg. */
+void *dm_rh_region_context(struct dm_region *reg)
+{
+	struct dm_region_hash *owner = reg->rh;
+
+	return owner->context;
+}
+EXPORT_SYMBOL_GPL(dm_rh_region_context);
+
+/* Return the region number (hash key) of @reg. */
+region_t dm_rh_get_region_key(struct dm_region *reg)
+{
+	region_t key = reg->key;
+
+	return key;
+}
+EXPORT_SYMBOL_GPL(dm_rh_get_region_key);
+
+/* Return the region size this hash was created with. */
+sector_t dm_rh_get_region_size(struct dm_region_hash *rh)
+{
+	sector_t size = rh->region_size;
+
+	return size;
+}
+EXPORT_SYMBOL_GPL(dm_rh_get_region_size);
+
+/*
+ * FIXME: shall we pass in a structure instead of all these args to
+ * dm_region_hash_create()????
+ */
+#define RH_HASH_MULT 2654435387U
+#define RH_HASH_SHIFT 12
+
+#define MIN_REGIONS 64
+/*
+ * Allocate and initialise a region hash.
+ *
+ * @context:                     opaque pointer handed back to the callbacks
+ * @dispatch_bios:               callback given a region's delayed bios
+ * @wakeup_workers:              callback to wake the caller's worker
+ * @wakeup_all_recovery_waiters: callback run when recovery_in_flight
+ *                               drops to zero
+ * @target_begin:                subtracted from bio sectors before they
+ *                               are mapped to regions
+ * @max_recovery:                number of recovery_count ups/downs done by
+ *                               dm_rh_start_recovery()/dm_rh_stop_recovery()
+ * @log:                         dirty log; destroyed together with the hash
+ *                               by dm_region_hash_destroy()
+ * @region_size:                 region size
+ * @nr_regions:                  number of regions, used only to size the
+ *                               bucket array
+ *
+ * Returns the new hash, or ERR_PTR(-ENOMEM) if any allocation fails
+ * (never NULL).
+ */
+struct dm_region_hash *dm_region_hash_create(
+		void *context, void (*dispatch_bios)(void *context,
+						     struct bio_list *bios),
+		void (*wakeup_workers)(void *context),
+		void (*wakeup_all_recovery_waiters)(void *context),
+		sector_t target_begin, unsigned max_recovery,
+		struct dm_dirty_log *log, uint32_t region_size,
+		region_t nr_regions)
+{
+	struct dm_region_hash *rh;
+	unsigned nr_buckets, max_buckets;
+	size_t i;
+
+	/*
+	 * Calculate a suitable number of buckets for our hash
+	 * table.
+	 *
+	 * Start at 128 and double until reaching nr_regions / 64, then
+	 * halve once.  The result is a power of two and at least 64,
+	 * which is what lets 'mask' below be nr_buckets - 1.
+	 */
+	max_buckets = nr_regions >> 6;
+	for (nr_buckets = 128u; nr_buckets < max_buckets; nr_buckets <<= 1)
+		;
+	nr_buckets >>= 1;
+
+	rh = kmalloc(sizeof(*rh), GFP_KERNEL);
+	if (!rh) {
+		DMERR("unable to allocate region hash memory");
+		return ERR_PTR(-ENOMEM);
+	}
+
+	rh->context = context;
+	rh->dispatch_bios = dispatch_bios;
+	rh->wakeup_workers = wakeup_workers;
+	rh->wakeup_all_recovery_waiters = wakeup_all_recovery_waiters;
+	rh->target_begin = target_begin;
+	rh->max_recovery = max_recovery;
+	rh->log = log;
+	rh->region_size = region_size;
+	/*
+	 * Index of the lowest set bit of region_size.
+	 * NOTE(review): this equals log2(region_size) only when region_size
+	 * is a power of two - presumably guaranteed by the caller; confirm.
+	 */
+	rh->region_shift = ffs(region_size) - 1;
+	rwlock_init(&rh->hash_lock);
+	rh->mask = nr_buckets - 1;
+	rh->nr_buckets = nr_buckets;
+
+	rh->shift = RH_HASH_SHIFT;
+	rh->prime = RH_HASH_MULT;
+
+	rh->buckets = vmalloc(nr_buckets * sizeof(*rh->buckets));
+	if (!rh->buckets) {
+		DMERR("unable to allocate region hash bucket memory");
+		kfree(rh);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	for (i = 0; i < nr_buckets; i++)
+		INIT_LIST_HEAD(rh->buckets + i);
+
+	spin_lock_init(&rh->region_lock);
+	/*
+	 * The semaphore starts at zero, so dm_rh_recovery_prepare() finds
+	 * no work slots until dm_rh_start_recovery() has upped it.
+	 */
+	sema_init(&rh->recovery_count, 0);
+	atomic_set(&rh->recovery_in_flight, 0);
+	INIT_LIST_HEAD(&rh->clean_regions);
+	INIT_LIST_HEAD(&rh->quiesced_regions);
+	INIT_LIST_HEAD(&rh->recovered_regions);
+	INIT_LIST_HEAD(&rh->failed_recovered_regions);
+
+	rh->region_pool = mempool_create_kmalloc_pool(MIN_REGIONS,
+						      sizeof(struct dm_region));
+	if (!rh->region_pool) {
+		/* Unwind the earlier allocations; no message is logged here. */
+		vfree(rh->buckets);
+		kfree(rh);
+		rh = ERR_PTR(-ENOMEM);
+	}
+
+	return rh;
+}
+EXPORT_SYMBOL_GPL(dm_region_hash_create);
+
+/*
+ * Tear down a region hash: free every region still hashed, destroy the
+ * dirty log (if one is attached) and the region pool, then release the
+ * bucket array and the hash itself.
+ *
+ * It is a BUG to call this while the quiesced list is non-empty or
+ * while any hashed region still has pending I/O.
+ */
+void dm_region_hash_destroy(struct dm_region_hash *rh)
+{
+	struct dm_region *cur, *tmp;
+	struct list_head *bucket;
+	unsigned idx;
+
+	BUG_ON(!list_empty(&rh->quiesced_regions));
+
+	for (idx = 0; idx < rh->nr_buckets; idx++) {
+		bucket = rh->buckets + idx;
+
+		list_for_each_entry_safe(cur, tmp, bucket, hash_list) {
+			BUG_ON(atomic_read(&cur->pending));
+			mempool_free(cur, rh->region_pool);
+		}
+	}
+
+	if (rh->log)
+		dm_dirty_log_destroy(rh->log);
+
+	if (rh->region_pool)
+		mempool_destroy(rh->region_pool);
+
+	vfree(rh->buckets);
+	kfree(rh);
+}
+EXPORT_SYMBOL_GPL(dm_region_hash_destroy);
+
+/* Return the dirty log attached to this region hash. */
+struct dm_dirty_log *dm_rh_dirty_log(struct dm_region_hash *rh)
+{
+	struct dm_dirty_log *log = rh->log;
+
+	return log;
+}
+EXPORT_SYMBOL_GPL(dm_rh_dirty_log);
+
+/*
+ * Hash a region number to a bucket index: multiply by rh->prime, shift
+ * right by rh->shift, truncate to unsigned and mask with rh->mask.
+ */
+static unsigned rh_hash(struct dm_region_hash *rh, region_t region)
+{
+	unsigned hashed = (unsigned) ((region * rh->prime) >> rh->shift);
+
+	return hashed & rh->mask;
+}
+
+/*
+ * Find the hashed entry for @region by walking its bucket.
+ * Returns the entry, or NULL if the region is not in the hash.
+ * Takes no locks itself.
+ */
+static struct dm_region *__rh_lookup(struct dm_region_hash *rh, region_t region)
+{
+	struct list_head *head = rh->buckets + rh_hash(rh, region);
+	struct dm_region *cur;
+
+	list_for_each_entry(cur, head, hash_list) {
+		if (cur->key == region)
+			return cur;
+	}
+
+	return NULL;
+}
+
+/* Link @reg into the bucket selected by its key.  Takes no locks itself. */
+static void __rh_insert(struct dm_region_hash *rh, struct dm_region *reg)
+{
+	struct list_head *head = rh->buckets + rh_hash(rh, reg->key);
+
+	list_add(&reg->hash_list, head);
+}
+
+/*
+ * Create the hash entry for @region and return it.
+ *
+ * The new entry's initial state comes from the dirty log: DM_RH_CLEAN
+ * if the log reports the region in sync, DM_RH_NOSYNC otherwise.  Clean
+ * entries are also put on the clean_regions list.
+ *
+ * The hash write lock is taken here, so the caller must not hold
+ * hash_lock (see __rh_find(), which drops its read lock around the
+ * call).  If another entry for the same region was inserted while we
+ * were allocating, the new one is discarded and the existing entry is
+ * returned instead.
+ */
+static struct dm_region *__rh_alloc(struct dm_region_hash *rh, region_t region)
+{
+	struct dm_region *reg, *nreg;
+
+	nreg = mempool_alloc(rh->region_pool, GFP_ATOMIC);
+	/*
+	 * Everything below dereferences nreg unconditionally and callers
+	 * have no way to handle a failure, so the fallback allocation must
+	 * not be allowed to return NULL.
+	 */
+	if (unlikely(!nreg))
+		nreg = kmalloc(sizeof(*nreg), GFP_NOIO | __GFP_NOFAIL);
+
+	nreg->state = rh->log->type->in_sync(rh->log, region, 1) ?
+		      DM_RH_CLEAN : DM_RH_NOSYNC;
+	nreg->rh = rh;
+	nreg->key = region;
+	INIT_LIST_HEAD(&nreg->list);
+	atomic_set(&nreg->pending, 0);
+	bio_list_init(&nreg->delayed_bios);
+
+	write_lock_irq(&rh->hash_lock);
+	reg = __rh_lookup(rh, region);
+	if (reg)
+		/* We lost the race. */
+		mempool_free(nreg, rh->region_pool);
+	else {
+		__rh_insert(rh, nreg);
+		if (nreg->state == DM_RH_CLEAN) {
+			/* irqs are already off: write_lock_irq() above */
+			spin_lock(&rh->region_lock);
+			list_add(&nreg->list, &rh->clean_regions);
+			spin_unlock(&rh->region_lock);
+		}
+
+		reg = nreg;
+	}
+	write_unlock_irq(&rh->hash_lock);
+
+	return reg;
+}
+
+/*
+ * Return the hash entry for @region, creating it if necessary.
+ *
+ * Must be called with hash_lock held for reading.  The read lock is
+ * dropped and re-taken around __rh_alloc() when the entry has to be
+ * created, and is held again on return.
+ */
+static struct dm_region *__rh_find(struct dm_region_hash *rh, region_t region)
+{
+	struct dm_region *reg = __rh_lookup(rh, region);
+
+	if (reg)
+		return reg;
+
+	read_unlock(&rh->hash_lock);
+	reg = __rh_alloc(rh, region);
+	read_lock(&rh->hash_lock);
+
+	return reg;
+}
+
+/*
+ * Report the state of @region.
+ *
+ * A region present in the hash reports the state stored in its entry.
+ * Otherwise the dirty log is asked (with @may_block passed through) and
+ * the answer is mapped to DM_RH_CLEAN or DM_RH_NOSYNC.
+ */
+int dm_rh_get_state(struct dm_region_hash *rh, region_t region, int may_block)
+{
+	struct dm_region *reg;
+	int in_sync;
+
+	read_lock(&rh->hash_lock);
+	reg = __rh_lookup(rh, region);
+	read_unlock(&rh->hash_lock);
+
+	if (!reg) {
+		/*
+		 * The region wasn't in the hash, so we fall back to the
+		 * dirty log.  Any error from the dirty log
+		 * (eg. -EWOULDBLOCK) gets taken as a DM_RH_NOSYNC.
+		 */
+		in_sync = rh->log->type->in_sync(rh->log, region, may_block);
+		if (in_sync == 1)
+			return DM_RH_CLEAN;
+
+		return DM_RH_NOSYNC;
+	}
+
+	return reg->state;
+}
+EXPORT_SYMBOL_GPL(dm_rh_get_state);
+
+/*
+ * Finish recovery of one region.
+ *
+ * Records the outcome (@success) in the dirty log, hands the region's
+ * delayed bios to the dispatch_bios callback, drops one
+ * recovery_in_flight reference (waking the recovery waiters when it
+ * reaches zero) and finally releases one recovery_count slot.
+ */
+static void complete_resync_work(struct dm_region *reg, int success)
+{
+	struct dm_region_hash *rh = reg->rh;
+
+	rh->log->type->set_region_sync(rh->log, reg->key, success);
+
+	/*
+	 * Dispatch the bios before we call 'wake_up_all'.
+	 * This is important because if we are suspending,
+	 * we want to know that recovery is complete and
+	 * the work queue is flushed.  If we wake_up_all
+	 * before we dispatch_bios (queue bios and call wake()),
+	 * then we risk suspending before the work queue
+	 * has been properly flushed.
+	 */
+	rh->dispatch_bios(rh->context, &reg->delayed_bios);
+	if (atomic_dec_and_test(&rh->recovery_in_flight))
+		rh->wakeup_all_recovery_waiters(rh->context);
+	/* Give back the slot taken by down_trylock() in dm_rh_recovery_prepare(). */
+	up(&rh->recovery_count);
+}
+
+/*
+ * dm_rh_mark_nosync
+ * @rh:    region hash the bio's region belongs to
+ * @bio:   the bio to complete
+ * @done:  not used by this function
+ * @error: status passed to bio_endio() for @bio
+ *
+ * The bio was written on some mirror(s) but failed on other mirror(s).
+ * We can successfully endio the bio but should avoid the region being
+ * marked clean by setting the state DM_RH_NOSYNC.
+ *
+ * This function is _not_ safe in interrupt context!
+ */
+void dm_rh_mark_nosync(struct dm_region_hash *rh,
+		       struct bio *bio, unsigned done, int error)
+{
+	unsigned long flags;
+	struct dm_dirty_log *log = rh->log;
+	struct dm_region *reg;
+	region_t region = dm_rh_bio_to_region(rh, bio);
+	int recovering = 0;
+
+	/* We must inform the log that the sync count has changed. */
+	log->type->set_region_sync(log, region, 0);
+
+	read_lock(&rh->hash_lock);
+	reg = __rh_find(rh, region);
+	read_unlock(&rh->hash_lock);
+
+	/* region hash entry should exist because write was in-flight */
+	BUG_ON(!reg);
+	BUG_ON(!list_empty(&reg->list));
+
+	spin_lock_irqsave(&rh->region_lock, flags);
+	/*
+	 * Possible cases:
+	 *   1) DM_RH_DIRTY
+	 *   2) DM_RH_NOSYNC: was dirty, other preceding writes failed
+	 *   3) DM_RH_RECOVERING: flushing pending writes
+	 * In every case the region should not be on any list.
+	 */
+	recovering = (reg->state == DM_RH_RECOVERING);
+	reg->state = DM_RH_NOSYNC;
+	BUG_ON(!list_empty(&reg->list));
+	spin_unlock_irqrestore(&rh->region_lock, flags);
+
+	bio_endio(bio, error);
+	/* A region that was being recovered has its resync completed as failed. */
+	if (recovering)
+		complete_resync_work(reg, 0);
+}
+EXPORT_SYMBOL_GPL(dm_rh_mark_nosync);
+
+/*
+ * Process the clean, recovered and failed-recovered lists.
+ *
+ * Every region on those lists is removed from the hash and freed:
+ *  - recovered regions are cleared in the dirty log and their resync is
+ *    completed as successful;
+ *  - failed-recovered regions have their resync completed as failed when
+ *    @errors_handled is non-zero, and as successful otherwise;
+ *  - clean regions are cleared in the dirty log.
+ * The dirty log is flushed at the end; the flush result is not checked.
+ */
+void dm_rh_update_states(struct dm_region_hash *rh, int errors_handled)
+{
+	struct dm_region *reg, *next;
+
+	LIST_HEAD(clean);
+	LIST_HEAD(recovered);
+	LIST_HEAD(failed_recovered);
+
+	/*
+	 * Quickly grab the lists.
+	 *
+	 * Both locks are held: hash_lock (write) for unhashing the
+	 * regions, region_lock for the lists themselves.
+	 */
+	write_lock_irq(&rh->hash_lock);
+	spin_lock(&rh->region_lock);
+	if (!list_empty(&rh->clean_regions)) {
+		list_splice_init(&rh->clean_regions, &clean);
+
+		list_for_each_entry(reg, &clean, list)
+			list_del(&reg->hash_list);
+	}
+
+	if (!list_empty(&rh->recovered_regions)) {
+		list_splice_init(&rh->recovered_regions, &recovered);
+
+		list_for_each_entry(reg, &recovered, list)
+			list_del(&reg->hash_list);
+	}
+
+	if (!list_empty(&rh->failed_recovered_regions)) {
+		list_splice_init(&rh->failed_recovered_regions,
+				 &failed_recovered);
+
+		list_for_each_entry(reg, &failed_recovered, list)
+			list_del(&reg->hash_list);
+	}
+
+	spin_unlock(&rh->region_lock);
+	write_unlock_irq(&rh->hash_lock);
+
+	/*
+	 * All the regions on the recovered and clean lists have
+	 * now been pulled out of the system, so no need to do
+	 * any more locking.
+	 */
+	list_for_each_entry_safe(reg, next, &recovered, list) {
+		rh->log->type->clear_region(rh->log, reg->key);
+		complete_resync_work(reg, 1);
+		mempool_free(reg, rh->region_pool);
+	}
+
+	list_for_each_entry_safe(reg, next, &failed_recovered, list) {
+		complete_resync_work(reg, errors_handled ? 0 : 1);
+		mempool_free(reg, rh->region_pool);
+	}
+
+	list_for_each_entry_safe(reg, next, &clean, list) {
+		rh->log->type->clear_region(rh->log, reg->key);
+		mempool_free(reg, rh->region_pool);
+	}
+
+	rh->log->type->flush(rh->log);
+}
+EXPORT_SYMBOL_GPL(dm_rh_update_states);
+
+/*
+ * Account one more pending I/O against @region, creating its hash
+ * entry if needed.  A region that was clean becomes dirty: it is taken
+ * off the clean list and marked in the dirty log.
+ */
+static void rh_inc(struct dm_region_hash *rh, region_t region)
+{
+	struct dm_region *reg;
+	int was_clean;
+
+	read_lock(&rh->hash_lock);
+	reg = __rh_find(rh, region);
+
+	spin_lock_irq(&rh->region_lock);
+	atomic_inc(&reg->pending);
+
+	was_clean = (reg->state == DM_RH_CLEAN);
+	if (was_clean) {
+		reg->state = DM_RH_DIRTY;
+		list_del_init(&reg->list);	/* take off the clean list */
+	}
+	spin_unlock_irq(&rh->region_lock);
+
+	/* The log is updated after region_lock has been released. */
+	if (was_clean)
+		rh->log->type->mark_region(rh->log, reg->key);
+
+	read_unlock(&rh->hash_lock);
+}
+
+/*
+ * Account a pending I/O for every bio on @bios, against the region
+ * each bio starts in.  The list itself is not modified.
+ */
+void dm_rh_inc_pending(struct dm_region_hash *rh, struct bio_list *bios)
+{
+	struct bio *cur = bios->head;
+
+	while (cur) {
+		rh_inc(rh, dm_rh_bio_to_region(rh, cur));
+		cur = cur->bi_next;
+	}
+}
+EXPORT_SYMBOL_GPL(dm_rh_inc_pending);
+
+/*
+ * Drop one pending I/O from @region.
+ *
+ * When the count reaches zero the region is queued for its next step
+ * (quiesced list if it is being recovered, clean list if it was dirty)
+ * and the caller's worker is woken.
+ *
+ * The region is looked up without a NULL check, so it must already be
+ * in the hash.
+ */
+void dm_rh_dec(struct dm_region_hash *rh, region_t region)
+{
+	struct dm_region *reg;
+	unsigned long flags;
+
+	read_lock(&rh->hash_lock);
+	reg = __rh_lookup(rh, region);
+	read_unlock(&rh->hash_lock);
+
+	spin_lock_irqsave(&rh->region_lock, flags);
+	if (!atomic_dec_and_test(&reg->pending)) {
+		/* Other I/O is still outstanding for this region. */
+		spin_unlock_irqrestore(&rh->region_lock, flags);
+		return;
+	}
+
+	/*
+	 * There is no pending I/O for this region.
+	 * We can move the region to corresponding list for next action.
+	 * At this point, the region is not yet connected to any list.
+	 *
+	 * If the state is DM_RH_NOSYNC, the region should be kept off
+	 * from clean list.
+	 * The hash entry for DM_RH_NOSYNC will remain in memory
+	 * until the region is recovered or the map is reloaded.
+	 */
+
+	/* do nothing for DM_RH_NOSYNC */
+	if (reg->state == DM_RH_RECOVERING) {
+		list_add_tail(&reg->list, &rh->quiesced_regions);
+	} else if (reg->state == DM_RH_DIRTY) {
+		reg->state = DM_RH_CLEAN;
+		list_add(&reg->list, &rh->clean_regions);
+	}
+	spin_unlock_irqrestore(&rh->region_lock, flags);
+
+	rh->wakeup_workers(rh->context);
+}
+EXPORT_SYMBOL_GPL(dm_rh_dec);
+
+/*
+ * Starts quiescing a region in preparation for recovery.
+ *
+ * Returns the dirty log's get_resync_work() result unchanged when it is
+ * zero or negative, and 1 once a region has been put into the
+ * DM_RH_RECOVERING state.
+ */
+static int __rh_recovery_prepare(struct dm_region_hash *rh)
+{
+	struct dm_region *reg;
+	region_t region;
+	int work;
+
+	/* Ask the dirty log what's next. */
+	work = rh->log->type->get_resync_work(rh->log, &region);
+	if (work <= 0)
+		return work;
+
+	/*
+	 * Get this region, and start it quiescing by setting the
+	 * recovering flag.
+	 */
+	read_lock(&rh->hash_lock);
+	reg = __rh_find(rh, region);
+	read_unlock(&rh->hash_lock);
+
+	spin_lock_irq(&rh->region_lock);
+	reg->state = DM_RH_RECOVERING;
+
+	if (!atomic_read(&reg->pending)) {
+		/* Already quiesced: queue it for recovery right away. */
+		list_move(&reg->list, &rh->quiesced_regions);
+	} else {
+		/* I/O still pending: just take it off whatever list it is on. */
+		list_del_init(&reg->list);
+	}
+
+	spin_unlock_irq(&rh->region_lock);
+
+	return 1;
+}
+
+/*
+ * Start quiescing as many regions as there are free recovery slots.
+ *
+ * Each loop iteration claims one recovery_count slot and one
+ * recovery_in_flight reference for the region it prepares.  If no
+ * region could be prepared, both are given back and the loop stops.
+ */
+void dm_rh_recovery_prepare(struct dm_region_hash *rh)
+{
+	/* Extra reference to avoid race with dm_rh_stop_recovery */
+	atomic_inc(&rh->recovery_in_flight);
+
+	/* down_trylock() returns 0 when a slot was obtained. */
+	while (!down_trylock(&rh->recovery_count)) {
+		atomic_inc(&rh->recovery_in_flight);
+		if (__rh_recovery_prepare(rh) <= 0) {
+			/* No work (or an error): undo this iteration's claims. */
+			atomic_dec(&rh->recovery_in_flight);
+			up(&rh->recovery_count);
+			break;
+		}
+	}
+
+	/* Drop the extra reference */
+	if (atomic_dec_and_test(&rh->recovery_in_flight))
+		rh->wakeup_all_recovery_waiters(rh->context);
+}
+EXPORT_SYMBOL_GPL(dm_rh_recovery_prepare);
+
+/*
+ * Returns any quiesced regions.
+ *
+ * Takes the first region off the quiesced list and returns it, or
+ * returns NULL when the list is empty.
+ */
+struct dm_region *dm_rh_recovery_start(struct dm_region_hash *rh)
+{
+	struct dm_region *reg;
+
+	spin_lock_irq(&rh->region_lock);
+	if (list_empty(&rh->quiesced_regions)) {
+		reg = NULL;
+	} else {
+		reg = list_entry(rh->quiesced_regions.next,
+				 struct dm_region, list);
+		list_del_init(&reg->list);  /* remove from the quiesced list */
+	}
+	spin_unlock_irq(&rh->region_lock);
+
+	return reg;
+}
+EXPORT_SYMBOL_GPL(dm_rh_recovery_start);
+
+/*
+ * Report the outcome of recovering @reg.
+ *
+ * On success the region goes onto the recovered list; on failure it is
+ * set to DM_RH_NOSYNC and goes onto the failed-recovered list.  The
+ * caller's worker is then woken.
+ */
+void dm_rh_recovery_end(struct dm_region *reg, int success)
+{
+	struct dm_region_hash *rh = reg->rh;
+	struct list_head *dest;
+
+	spin_lock_irq(&rh->region_lock);
+	if (success) {
+		dest = &rh->recovered_regions;
+	} else {
+		reg->state = DM_RH_NOSYNC;
+		dest = &rh->failed_recovered_regions;
+	}
+	list_add(&reg->list, dest);
+	spin_unlock_irq(&rh->region_lock);
+
+	rh->wakeup_workers(rh->context);
+}
+EXPORT_SYMBOL_GPL(dm_rh_recovery_end);
+
+/* Return the current value of the recovery_in_flight counter. */
+int dm_rh_recovery_in_flight(struct dm_region_hash *rh)
+{
+	int in_flight = atomic_read(&rh->recovery_in_flight);
+
+	return in_flight;
+}
+EXPORT_SYMBOL_GPL(dm_rh_recovery_in_flight);
+
+/* Flush the dirty log and return the log's flush result. */
+int dm_rh_flush(struct dm_region_hash *rh)
+{
+	struct dm_dirty_log *log = rh->log;
+
+	return log->type->flush(log);
+}
+EXPORT_SYMBOL_GPL(dm_rh_flush);
+
+/*
+ * Queue @bio on the delayed_bios list of the region it starts in,
+ * creating the region's hash entry if needed.  The bios are dispatched
+ * later by complete_resync_work().
+ */
+void dm_rh_delay(struct dm_region_hash *rh, struct bio *bio)
+{
+	region_t region = dm_rh_bio_to_region(rh, bio);
+	struct dm_region *reg;
+
+	read_lock(&rh->hash_lock);
+	reg = __rh_find(rh, region);
+	bio_list_add(&reg->delayed_bios, bio);
+	read_unlock(&rh->hash_lock);
+}
+EXPORT_SYMBOL_GPL(dm_rh_delay);
+
+/*
+ * Stop recovery: take all max_recovery slots of the recovery_count
+ * semaphore.  Each down() blocks until a slot is free, i.e. until a
+ * recovery holding it has released it in complete_resync_work().
+ * Undone by dm_rh_start_recovery().
+ */
+void dm_rh_stop_recovery(struct dm_region_hash *rh)
+{
+	/* unsigned, to match the type of rh->max_recovery */
+	unsigned i;
+
+	/* wait for any recovering regions */
+	for (i = 0; i < rh->max_recovery; i++)
+		down(&rh->recovery_count);
+}
+EXPORT_SYMBOL_GPL(dm_rh_stop_recovery);
+
+/*
+ * Allow recovery to run: release max_recovery slots on the
+ * recovery_count semaphore, then wake the caller's worker.
+ * Counterpart of dm_rh_stop_recovery().
+ */
+void dm_rh_start_recovery(struct dm_region_hash *rh)
+{
+	/* unsigned, to match the type of rh->max_recovery */
+	unsigned i;
+
+	for (i = 0; i < rh->max_recovery; i++)
+		up(&rh->recovery_count);
+
+	rh->wakeup_workers(rh->context);
+}
+EXPORT_SYMBOL_GPL(dm_rh_start_recovery);
+
+MODULE_DESCRIPTION(DM_NAME " region hash");
+MODULE_AUTHOR("Joe Thornber/Heinz Mauelshagen <dm-devel@redhat.com>");
+MODULE_LICENSE("GPL");
--- a/drivers/md/dm-round-robin.c
+++ b/drivers/md/dm-round-robin.c
@ -9,7 +9,8 @@
 * Round-robin path selector.
 */

-#include "dm.h"
+#include <linux/device-mapper.h>
+
 #include "dm-path-selector.h"

 #include <linux/slab.h>
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@ -600,7 +600,6 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)

 	s->valid = 1;
 	s->active = 0;
-	s->last_percent = 0;
 	init_rwsem(&s->lock);
 	spin_lock_init(&s->pe_lock);
 	s->ti = ti;
@ -824,8 +823,10 @@ static struct bio *put_pending_exception(struct dm_snap_pending_exception *pe)
 	 * the bios for the original write to the origin.
 	 */
 	if (primary_pe &&
-	    atomic_dec_and_test(&primary_pe->ref_count))
+	    atomic_dec_and_test(&primary_pe->ref_count)) {
 		origin_bios = bio_list_get(&primary_pe->origin_bios);
+		free_pending_exception(primary_pe);
+	}

 	/*
 	 * Free the pe if it's not linked to an origin write or if
@ -834,12 +835,6 @@ static struct bio *put_pending_exception(struct dm_snap_pending_exception *pe)
 	if (!primary_pe || primary_pe != pe)
 		free_pending_exception(pe);

-	/*
-	 * Free the primary pe if nothing references it.
-	 */
-	if (primary_pe && !atomic_read(&primary_pe->ref_count))
-		free_pending_exception(primary_pe);
-
 	return origin_bios;
 }

--- a/drivers/md/dm-snap.h
+++ b/drivers/md/dm-snap.h
@ -9,7 +9,7 @@
 #ifndef DM_SNAPSHOT_H
 #define DM_SNAPSHOT_H

-#include "dm.h"
+#include <linux/device-mapper.h>
 #include "dm-bio-list.h"
 #include <linux/blkdev.h>
 #include <linux/workqueue.h>
@ -158,9 +158,6 @@ struct dm_snapshot {
 	/* Used for display of table */
 	char type;

-	/* The last percentage we notified */
-	int last_percent;
-
 	mempool_t *pending_pool;

 	struct exception_table pending;
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@ -4,7 +4,7 @@
 * This file is released under the GPL.
 */

-#include "dm.h"
+#include <linux/device-mapper.h>

 #include <linux/module.h>
 #include <linux/init.h>
@ -60,8 +60,8 @@ static inline struct stripe_c *alloc_context(unsigned int stripes)
 {
 	size_t len;

-	if (array_too_big(sizeof(struct stripe_c), sizeof(struct stripe),
-			  stripes))
+	if (dm_array_too_big(sizeof(struct stripe_c), sizeof(struct stripe),
+			     stripes))
 		return NULL;

 	len = sizeof(struct stripe_c) + (sizeof(struct stripe) * stripes);
--- a/drivers/md/dm-zero.c
+++ b/drivers/md/dm-zero.c
@ -4,7 +4,7 @@
 * This file is released under the GPL.
 */

-#include "dm.h"
+#include <linux/device-mapper.h>

 #include <linux/module.h>
 #include <linux/init.h>
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@ -76,7 +76,6 @@ union map_info *dm_get_mapinfo(struct bio *bio)
 */
 struct dm_wq_req {
 	enum {
-		DM_WQ_FLUSH_ALL,
 		DM_WQ_FLUSH_DEFERRED,
 	} type;
 	struct work_struct work;
@ -151,40 +150,40 @@ static struct kmem_cache *_tio_cache;

 static int __init local_init(void)
 {
-	int r;
+	int r = -ENOMEM;

 	/* allocate a slab for the dm_ios */
 	_io_cache = KMEM_CACHE(dm_io, 0);
 	if (!_io_cache)
-		return -ENOMEM;
+		return r;

 	/* allocate a slab for the target ios */
 	_tio_cache = KMEM_CACHE(dm_target_io, 0);
-	if (!_tio_cache) {
-		kmem_cache_destroy(_io_cache);
-		return -ENOMEM;
-	}
+	if (!_tio_cache)
+		goto out_free_io_cache;

 	r = dm_uevent_init();
-	if (r) {
-		kmem_cache_destroy(_tio_cache);
-		kmem_cache_destroy(_io_cache);
-		return r;
-	}
+	if (r)
+		goto out_free_tio_cache;

 	_major = major;
 	r = register_blkdev(_major, _name);
-	if (r < 0) {
-		kmem_cache_destroy(_tio_cache);
-		kmem_cache_destroy(_io_cache);
-		dm_uevent_exit();
-		return r;
-	}
+	if (r < 0)
+		goto out_uevent_exit;

 	if (!_major)
 		_major = r;

 	return 0;
+
+out_uevent_exit:
+	dm_uevent_exit();
+out_free_tio_cache:
+	kmem_cache_destroy(_tio_cache);
+out_free_io_cache:
+	kmem_cache_destroy(_io_cache);
+
+	return r;
 }

 static void local_exit(void)
@ -669,6 +668,7 @@ static struct bio *split_bvec(struct bio *bio, sector_t sector,
 	clone->bi_size = to_bytes(len);
 	clone->bi_io_vec->bv_offset = offset;
 	clone->bi_io_vec->bv_len = clone->bi_size;
+	clone->bi_flags |= 1 << BIO_CLONED;

 	return clone;
 }
@ -1394,9 +1394,6 @@ static void dm_wq_work(struct work_struct *work)

 	down_write(&md->io_lock);
 	switch (req->type) {
-	case DM_WQ_FLUSH_ALL:
-		__merge_pushback_list(md);
-		/* pass through */
 	case DM_WQ_FLUSH_DEFERRED:
 		__flush_deferred_io(md);
 		break;
@ -1526,7 +1523,7 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 		if (!md->suspended_bdev) {
 			DMWARN("bdget failed in dm_suspend");
 			r = -ENOMEM;
-			goto flush_and_out;
+			goto out;
 		}

 		/*
@ -1577,14 +1574,6 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)

 	set_bit(DMF_SUSPENDED, &md->flags);

-flush_and_out:
-	if (r && noflush)
-		/*
-		 * Because there may be already I/Os in the pushback list,
-		 * flush them before return.
-		 */
-		dm_queue_flush(md, DM_WQ_FLUSH_ALL, NULL);
-
 out:
 	if (r && md->suspended_bdev) {
 		bdput(md->suspended_bdev);
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@ -62,15 +62,6 @@ void dm_put_target_type(struct target_type *t);
 int dm_target_iterate(void (*iter_func)(struct target_type *tt,
 					void *param), void *param);

-/*-----------------------------------------------------------------
- * Useful inlines.
- *---------------------------------------------------------------*/
-static inline int array_too_big(unsigned long fixed, unsigned long obj,
-				unsigned long num)
-{
-	return (num > (ULONG_MAX - fixed) / obj);
-}
-
 int dm_split_args(int *argc, char ***argvp, char *input);

 /*
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@ -354,6 +354,9 @@ void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size);
 */
 #define dm_round_up(n, sz) (dm_div_up((n), (sz)) * (sz))

+#define dm_array_too_big(fixed, obj, num) \
+	((num) > (UINT_MAX - (fixed)) / (obj))
+
 static inline sector_t to_sector(unsigned long n)
 {
 	return (n >> SECTOR_SHIFT);
--- a/include/linux/dm-region-hash.h
+++ b/include/linux/dm-region-hash.h
@ -0,0 +1,104 @@
+/*
+ * Copyright (C) 2003 Sistina Software Limited.
+ * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
+ *
+ * Device-Mapper dirty region hash interface.
+ *
+ * This file is released under the GPL.
+ */
+
+#ifndef DM_REGION_HASH_H
+#define DM_REGION_HASH_H
+
+#include <linux/dm-dirty-log.h>
+
+/*-----------------------------------------------------------------
+ * Region hash
+ *----------------------------------------------------------------*/
+struct dm_region_hash;
+struct dm_region;
+
+/*
+ * States a region can have.
+ */
+enum dm_rh_region_states {
+	DM_RH_CLEAN	 = 0x01,	/* No writes in flight. */
+	DM_RH_DIRTY	 = 0x02,	/* Writes in flight. */
+	DM_RH_NOSYNC	 = 0x04,	/* Out of sync. */
+	DM_RH_RECOVERING = 0x08,	/* Under resynchronization. */
+};
+
+/*
+ * Region hash create/destroy.
+ */
+struct bio_list;
+struct dm_region_hash *dm_region_hash_create(
+		void *context, void (*dispatch_bios)(void *context,
+						     struct bio_list *bios),
+		void (*wakeup_workers)(void *context),
+		void (*wakeup_all_recovery_waiters)(void *context),
+		sector_t target_begin, unsigned max_recovery,
+		struct dm_dirty_log *log, uint32_t region_size,
+		region_t nr_regions);
+void dm_region_hash_destroy(struct dm_region_hash *rh);
+
+struct dm_dirty_log *dm_rh_dirty_log(struct dm_region_hash *rh);
+
+/*
+ * Conversion functions.
+ */
+region_t dm_rh_bio_to_region(struct dm_region_hash *rh, struct bio *bio);
+sector_t dm_rh_region_to_sector(struct dm_region_hash *rh, region_t region);
+void *dm_rh_region_context(struct dm_region *reg);
+
+/*
+ * Get region size and key (ie. number of the region).
+ */
+sector_t dm_rh_get_region_size(struct dm_region_hash *rh);
+region_t dm_rh_get_region_key(struct dm_region *reg);
+
+/*
+ * Get/set/update region state (and dirty log).
+ *
+ */
+int dm_rh_get_state(struct dm_region_hash *rh, region_t region, int may_block);
+void dm_rh_set_state(struct dm_region_hash *rh, region_t region,
+		     enum dm_rh_region_states state, int may_block);
+
+/* Non-zero errors_handled leaves the state of the region NOSYNC */
+void dm_rh_update_states(struct dm_region_hash *rh, int errors_handled);
+
+/* Flush the region hash and dirty log. */
+int dm_rh_flush(struct dm_region_hash *rh);
+
+/* Inc/dec pending count on regions. */
+void dm_rh_inc_pending(struct dm_region_hash *rh, struct bio_list *bios);
+void dm_rh_dec(struct dm_region_hash *rh, region_t region);
+
+/* Delay bios on regions. */
+void dm_rh_delay(struct dm_region_hash *rh, struct bio *bio);
+
+void dm_rh_mark_nosync(struct dm_region_hash *rh,
+		       struct bio *bio, unsigned done, int error);
+
+/*
+ * Region recovery control.
+ */
+
+/* Prepare some regions for recovery by starting to quiesce them. */
+void dm_rh_recovery_prepare(struct dm_region_hash *rh);
+
+/* Try fetching a quiesced region for recovery. */
+struct dm_region *dm_rh_recovery_start(struct dm_region_hash *rh);
+
+/*
+ * Report recovery end on a region.
+ *
+ * @success is a boolean, not an error code: non-zero means the region was
+ * recovered; zero means recovery failed and the region is marked
+ * DM_RH_NOSYNC.
+ */
+void dm_rh_recovery_end(struct dm_region *reg, int success);
+
+/* Returns number of regions with recovery work outstanding. */
+int dm_rh_recovery_in_flight(struct dm_region_hash *rh);
+
+/* Start/stop recovery. */
+void dm_rh_start_recovery(struct dm_region_hash *rh);
+void dm_rh_stop_recovery(struct dm_region_hash *rh);
+
+#endif /* DM_REGION_HASH_H */